rm(list=ls())
options(warn=-1)
library(tidyverse)
-- Attaching packages --------------------------------------- tidyverse 1.3.0 -- v ggplot2 3.3.2 v purrr 0.3.4 v tibble 3.0.3 v dplyr 1.0.2 v tidyr 1.1.2 v stringr 1.4.0 v readr 1.3.1 v forcats 0.5.0 -- Conflicts ------------------------------------------ tidyverse_conflicts() -- x dplyr::filter() masks stats::filter() x dplyr::lag() masks stats::lag()
Now, we will read all the necessory CSV files and save the data which we need to use.
sales = read_csv("sales_train.csv")
head(sales)
Parsed with column specification: cols( date = col_character(), date_block_num = col_double(), shop_id = col_double(), item_id = col_double(), item_price = col_double(), item_cnt_day = col_double() )
| date | date_block_num | shop_id | item_id | item_price | item_cnt_day |
|---|---|---|---|---|---|
| <chr> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> |
| 02.01.2013 | 0 | 59 | 22154 | 999.00 | 1 |
| 03.01.2013 | 0 | 25 | 2552 | 899.00 | 1 |
| 05.01.2013 | 0 | 25 | 2552 | 899.00 | -1 |
| 06.01.2013 | 0 | 25 | 2554 | 1709.05 | 1 |
| 15.01.2013 | 0 | 25 | 2555 | 1099.00 | 1 |
| 10.01.2013 | 0 | 25 | 2564 | 349.00 | 1 |
items = read_csv("items.csv")
head(items)
Parsed with column specification: cols( item_name = col_character(), item_id = col_double(), item_category_id = col_double() )
| item_name | item_id | item_category_id |
|---|---|---|
| <chr> | <dbl> | <dbl> |
| ! <U+0412><U+041E> <U+0412><U+041B><U+0410><U+0421><U+0422><U+0418> <U+041D><U+0410><U+0412><U+0410><U+0416><U+0414><U+0415><U+041D><U+0418><U+042F> (<U+041F><U+041B><U+0410><U+0421><U+0422>.) D | 0 | 40 |
| !ABBYY FineReader 12 Professional Edition Full [PC, <U+0426><U+0438><U+0444><U+0440><U+043E><U+0432><U+0430><U+044F> <U+0432><U+0435><U+0440><U+0441><U+0438><U+044F>] | 1 | 76 |
| ***<U+0412> <U+041B><U+0423><U+0427><U+0410><U+0425> <U+0421><U+041B><U+0410><U+0412><U+042B> (UNV) D | 2 | 40 |
| ***<U+0413><U+041E><U+041B><U+0423><U+0411><U+0410><U+042F> <U+0412><U+041E><U+041B><U+041D><U+0410> (Univ) D | 3 | 40 |
| ***<U+041A><U+041E><U+0420><U+041E><U+0411><U+041A><U+0410> (<U+0421><U+0422><U+0415><U+041A><U+041B><U+041E>) D | 4 | 40 |
| ***<U+041D><U+041E><U+0412><U+042B><U+0415> <U+0410><U+041C><U+0415><U+0420><U+0418><U+041A><U+0410><U+041D><U+0421><U+041A><U+0418><U+0415> <U+0413><U+0420><U+0410><U+0424><U+0424><U+0418><U+0422><U+0418> (UNI) D | 5 | 40 |
shops = read_csv("shops.csv")
head(shops)
Parsed with column specification: cols( shop_name = col_character(), shop_id = col_double() )
| shop_name | shop_id |
|---|---|
| <chr> | <dbl> |
| !<U+042F><U+043A><U+0443><U+0442><U+0441><U+043A> <U+041E><U+0440><U+0434><U+0436><U+043E><U+043D><U+0438><U+043A><U+0438><U+0434><U+0437><U+0435>, 56 <U+0444><U+0440><U+0430><U+043D> | 0 |
| !<U+042F><U+043A><U+0443><U+0442><U+0441><U+043A> <U+0422><U+0426> "<U+0426><U+0435><U+043D><U+0442><U+0440><U+0430><U+043B><U+044C><U+043D><U+044B><U+0439>" <U+0444><U+0440><U+0430><U+043D> | 1 |
| <U+0410><U+0434><U+044B><U+0433><U+0435><U+044F> <U+0422><U+0426> "<U+041C><U+0435><U+0433><U+0430>" | 2 |
| <U+0411><U+0430><U+043B><U+0430><U+0448><U+0438><U+0445><U+0430> <U+0422><U+0420><U+041A> "<U+041E><U+043A><U+0442><U+044F><U+0431><U+0440><U+044C>-<U+041A><U+0438><U+043D><U+043E><U+043C><U+0438><U+0440>" | 3 |
| <U+0412><U+043E><U+043B><U+0436><U+0441><U+043A><U+0438><U+0439> <U+0422><U+0426> "<U+0412><U+043E><U+043B><U+0433><U+0430> <U+041C><U+043E><U+043B><U+043B>" | 4 |
| <U+0412><U+043E><U+043B><U+043E><U+0433><U+0434><U+0430> <U+0422><U+0420><U+0426> "<U+041C><U+0430><U+0440><U+043C><U+0435><U+043B><U+0430><U+0434>" | 5 |
item_categories = read_csv("item_categories.csv" )
head(item_categories)
Parsed with column specification: cols( item_category_name = col_character(), item_category_id = col_double() )
| item_category_name | item_category_id |
|---|---|
| <chr> | <dbl> |
| PC - <U+0413><U+0430><U+0440><U+043D><U+0438><U+0442><U+0443><U+0440><U+044B>/<U+041D><U+0430><U+0443><U+0448><U+043D><U+0438><U+043A><U+0438> | 0 |
| <U+0410><U+043A><U+0441><U+0435><U+0441><U+0441><U+0443><U+0430><U+0440><U+044B> - PS2 | 1 |
| <U+0410><U+043A><U+0441><U+0435><U+0441><U+0441><U+0443><U+0430><U+0440><U+044B> - PS3 | 2 |
| <U+0410><U+043A><U+0441><U+0435><U+0441><U+0441><U+0443><U+0430><U+0440><U+044B> - PS4 | 3 |
| <U+0410><U+043A><U+0441><U+0435><U+0441><U+0441><U+0443><U+0430><U+0440><U+044B> - PSP | 4 |
| <U+0410><U+043A><U+0441><U+0435><U+0441><U+0441><U+0443><U+0430><U+0440><U+044B> - PSVita | 5 |
# Joining the items, shops and item_categories data to the sales data
merged_sales = sales %>%
inner_join(items, by = "item_id") %>%
inner_join(shops, by = "shop_id") %>%
inner_join(item_categories, by = "item_category_id")
head(merged_sales)
| date | date_block_num | shop_id | item_id | item_price | item_cnt_day | item_name | item_category_id | shop_name | item_category_name |
|---|---|---|---|---|---|---|---|---|---|
| <chr> | <dbl> | <dbl> | <dbl> | <dbl> | <dbl> | <chr> | <dbl> | <chr> | <chr> |
| 02.01.2013 | 0 | 59 | 22154 | 999.00 | 1 | <U+042F><U+0412><U+041B><U+0415><U+041D><U+0418><U+0415> 2012 (BD) | 37 | <U+042F><U+0440><U+043E><U+0441><U+043B><U+0430><U+0432><U+043B><U+044C> <U+0422><U+0426> "<U+0410><U+043B><U+044C><U+0442><U+0430><U+0438><U+0440>" | <U+041A><U+0438><U+043D><U+043E> - Blu-Ray |
| 03.01.2013 | 0 | 25 | 2552 | 899.00 | 1 | DEEP PURPLE The House Of Blue Light LP | 58 | <U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0420><U+041A> "<U+0410><U+0442><U+0440><U+0438><U+0443><U+043C>" | <U+041C><U+0443><U+0437><U+044B><U+043A><U+0430> - <U+0412><U+0438><U+043D><U+0438><U+043B> |
| 05.01.2013 | 0 | 25 | 2552 | 899.00 | -1 | DEEP PURPLE The House Of Blue Light LP | 58 | <U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0420><U+041A> "<U+0410><U+0442><U+0440><U+0438><U+0443><U+043C>" | <U+041C><U+0443><U+0437><U+044B><U+043A><U+0430> - <U+0412><U+0438><U+043D><U+0438><U+043B> |
| 06.01.2013 | 0 | 25 | 2554 | 1709.05 | 1 | DEEP PURPLE Who Do You Think We Are LP | 58 | <U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0420><U+041A> "<U+0410><U+0442><U+0440><U+0438><U+0443><U+043C>" | <U+041C><U+0443><U+0437><U+044B><U+043A><U+0430> - <U+0412><U+0438><U+043D><U+0438><U+043B> |
| 15.01.2013 | 0 | 25 | 2555 | 1099.00 | 1 | DEEP PURPLE 30 Very Best Of 2CD (<U+0424><U+0438><U+0440><U+043C>.) | 56 | <U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0420><U+041A> "<U+0410><U+0442><U+0440><U+0438><U+0443><U+043C>" | <U+041C><U+0443><U+0437><U+044B><U+043A><U+0430> - CD <U+0444><U+0438><U+0440><U+043C><U+0435><U+043D><U+043D><U+043E><U+0433><U+043E> <U+043F><U+0440><U+043E><U+0438><U+0437><U+0432><U+043E><U+0434><U+0441><U+0442><U+0432><U+0430> |
| 10.01.2013 | 0 | 25 | 2564 | 349.00 | 1 | DEEP PURPLE Perihelion: Live In Concert DVD (<U+041A><U+0438><U+0440>.) | 59 | <U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0420><U+041A> "<U+0410><U+0442><U+0440><U+0438><U+0443><U+043C>" | <U+041C><U+0443><U+0437><U+044B><U+043A><U+0430> - <U+041C><U+0443><U+0437><U+044B><U+043A><U+0430><U+043B><U+044C><U+043D><U+043E><U+0435> <U+0432><U+0438><U+0434><U+0435><U+043E> |
# Seprate dates into month, day, year
library(lubridate)
merged_sales$date = dmy(merged_sales$date)
merged_sales$year = year(merged_sales$date)
merged_sales$month = month(merged_sales$date)
merged_sales$day = day(merged_sales$date)
merged_sales$weekday = weekdays(merged_sales$date)
Attaching package: 'lubridate'
The following objects are masked from 'package:base':
date, intersect, setdiff, union
# Converting the columns into factors
merged_sales$year = as.factor(merged_sales$year)
merged_sales$month = as.factor(merged_sales$month)
merged_sales$weekday = as.factor(merged_sales$weekday)
merged_sales$shop_id = as.factor(merged_sales$shop_id)
merged_sales$item_id = as.factor(merged_sales$item_id)
merged_sales$item_category_id = as.factor(merged_sales$item_category_id)
str(merged_sales)
tibble [2,935,849 x 14] (S3: spec_tbl_df/tbl_df/tbl/data.frame) $ date : Date[1:2935849], format: "2013-01-02" "2013-01-03" ... $ date_block_num : num [1:2935849] 0 0 0 0 0 0 0 0 0 0 ... $ shop_id : Factor w/ 60 levels "0","1","2","3",..: 60 26 26 26 26 26 26 26 26 26 ... $ item_id : Factor w/ 21807 levels "0","1","2","3",..: 21792 2496 2496 2498 2499 2508 2509 2515 2515 2516 ... $ item_price : num [1:2935849] 999 899 899 1709 1099 ... $ item_cnt_day : num [1:2935849] 1 1 -1 1 1 1 1 1 1 3 ... $ item_name : chr [1:2935849] "<U+042F><U+0412><U+041B><U+0415><U+041D><U+0418><U+0415> 2012 (BD)" "DEEP PURPLE The House Of Blue Light LP" "DEEP PURPLE The House Of Blue Light LP" "DEEP PURPLE Who Do You Think We Are LP" ... $ item_category_id : Factor w/ 84 levels "0","1","2","3",..: 38 59 59 59 57 60 57 56 56 56 ... $ shop_name : chr [1:2935849] "<U+042F><U+0440><U+043E><U+0441><U+043B><U+0430><U+0432><U+043B><U+044C> <U+0422><U+0426> \"<U+0410><U+043B><U+"| __truncated__ "<U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0420><U+041A> \"<U+0410><U+0442><U+0440><U+0438><U+0443><U+043C>\"" "<U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0420><U+041A> \"<U+0410><U+0442><U+0440><U+0438><U+0443><U+043C>\"" "<U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0420><U+041A> \"<U+0410><U+0442><U+0440><U+0438><U+0443><U+043C>\"" ... $ item_category_name: chr [1:2935849] "<U+041A><U+0438><U+043D><U+043E> - Blu-Ray" "<U+041C><U+0443><U+0437><U+044B><U+043A><U+0430> - <U+0412><U+0438><U+043D><U+0438><U+043B>" "<U+041C><U+0443><U+0437><U+044B><U+043A><U+0430> - <U+0412><U+0438><U+043D><U+0438><U+043B>" "<U+041C><U+0443><U+0437><U+044B><U+043A><U+0430> - <U+0412><U+0438><U+043D><U+0438><U+043B>" ... $ year : Factor w/ 3 levels "2013","2014",..: 1 1 1 1 1 1 1 1 1 1 ... $ month : Factor w/ 12 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ... $ day : int [1:2935849] 2 3 5 6 15 10 2 4 11 3 ... $ weekday : Factor w/ 7 levels "Friday","Monday",..: 7 5 3 4 6 5 7 1 1 5 ... - attr(*, "spec")= .. cols( .. date = col_character(), .. date_block_num = col_double(), .. shop_id = col_double(), .. item_id = col_double(), .. item_price = col_double(), .. item_cnt_day = col_double() .. )
# Check for missing data
any(is.na(merged_sales))
This is necessory just to make sure we don't have any NAs which can cause problems in the future.
#Total items by month
aggregate(item_cnt_day~month, merged_sales, sum)
| month | item_cnt_day |
|---|---|
| <fct> | <dbl> |
| 1 | 359349 |
| 2 | 321975 |
| 3 | 344453 |
| 4 | 281573 |
| 5 | 277055 |
| 6 | 286924 |
| 7 | 271433 |
| 8 | 294091 |
| 9 | 305383 |
| 10 | 306019 |
| 11 | 247854 |
| 12 | 352097 |
# This will give number of distinct shops
no_of_shops = merged_sales %>%
select(shop_id) %>%
distinct() %>%
summarise(shops=n())
no_of_shops
| shops |
|---|
| <int> |
| 60 |
# This will give number of times a shop apperars in the data set.
shop_frequency = merged_sales %>%
group_by(shop_id,shop_name) %>%
summarise(number=n())
shop_frequency
`summarise()` regrouping output by 'shop_id' (override with `.groups` argument)
| shop_id | shop_name | number |
|---|---|---|
| <fct> | <chr> | <int> |
| 0 | !<U+042F><U+043A><U+0443><U+0442><U+0441><U+043A> <U+041E><U+0440><U+0434><U+0436><U+043E><U+043D><U+0438><U+043A><U+0438><U+0434><U+0437><U+0435>, 56 <U+0444><U+0440><U+0430><U+043D> | 9857 |
| 1 | !<U+042F><U+043A><U+0443><U+0442><U+0441><U+043A> <U+0422><U+0426> "<U+0426><U+0435><U+043D><U+0442><U+0440><U+0430><U+043B><U+044C><U+043D><U+044B><U+0439>" <U+0444><U+0440><U+0430><U+043D> | 5678 |
| 2 | <U+0410><U+0434><U+044B><U+0433><U+0435><U+044F> <U+0422><U+0426> "<U+041C><U+0435><U+0433><U+0430>" | 25991 |
| 3 | <U+0411><U+0430><U+043B><U+0430><U+0448><U+0438><U+0445><U+0430> <U+0422><U+0420><U+041A> "<U+041E><U+043A><U+0442><U+044F><U+0431><U+0440><U+044C>-<U+041A><U+0438><U+043D><U+043E><U+043C><U+0438><U+0440>" | 25532 |
| 4 | <U+0412><U+043E><U+043B><U+0436><U+0441><U+043A><U+0438><U+0439> <U+0422><U+0426> "<U+0412><U+043E><U+043B><U+0433><U+0430> <U+041C><U+043E><U+043B><U+043B>" | 38242 |
| 5 | <U+0412><U+043E><U+043B><U+043E><U+0433><U+0434><U+0430> <U+0422><U+0420><U+0426> "<U+041C><U+0430><U+0440><U+043C><U+0435><U+043B><U+0430><U+0434>" | 38179 |
| 6 | <U+0412><U+043E><U+0440><U+043E><U+043D><U+0435><U+0436> (<U+041F><U+043B><U+0435><U+0445><U+0430><U+043D><U+043E><U+0432><U+0441><U+043A><U+0430><U+044F>, 13) | 82663 |
| 7 | <U+0412><U+043E><U+0440><U+043E><U+043D><U+0435><U+0436> <U+0422><U+0420><U+0426> "<U+041C><U+0430><U+043A><U+0441><U+0438><U+043C><U+0438><U+0440>" | 58076 |
| 8 | <U+0412><U+043E><U+0440><U+043E><U+043D><U+0435><U+0436> <U+0422><U+0420><U+0426> <U+0421><U+0438><U+0442><U+0438>-<U+041F><U+0430><U+0440><U+043A> "<U+0413><U+0440><U+0430><U+0434>" | 3412 |
| 9 | <U+0412><U+044B><U+0435><U+0437><U+0434><U+043D><U+0430><U+044F> <U+0422><U+043E><U+0440><U+0433><U+043E><U+0432><U+043B><U+044F> | 3751 |
| 10 | <U+0416><U+0443><U+043A><U+043E><U+0432><U+0441><U+043A><U+0438><U+0439> <U+0443><U+043B>. <U+0427><U+043A><U+0430><U+043B><U+043E><U+0432><U+0430> 39<U+043C>? | 21397 |
| 11 | <U+0416><U+0443><U+043A><U+043E><U+0432><U+0441><U+043A><U+0438><U+0439> <U+0443><U+043B>. <U+0427><U+043A><U+0430><U+043B><U+043E><U+0432><U+0430> 39<U+043C>² | 499 |
| 12 | <U+0418><U+043D><U+0442><U+0435><U+0440><U+043D><U+0435><U+0442>-<U+043C><U+0430><U+0433><U+0430><U+0437><U+0438><U+043D> <U+0427><U+0421> | 34694 |
| 13 | <U+041A><U+0430><U+0437><U+0430><U+043D><U+044C> <U+0422><U+0426> "<U+0411><U+0435><U+0445><U+0435><U+0442><U+043B><U+0435>" | 17824 |
| 14 | <U+041A><U+0430><U+0437><U+0430><U+043D><U+044C> <U+0422><U+0426> "<U+041F><U+0430><U+0440><U+043A><U+0425><U+0430><U+0443><U+0441>" II | 36979 |
| 15 | <U+041A><U+0430><U+043B><U+0443><U+0433><U+0430> <U+0422><U+0420><U+0426> "XXI <U+0432><U+0435><U+043A>" | 59511 |
| 16 | <U+041A><U+043E><U+043B><U+043E><U+043C><U+043D><U+0430> <U+0422><U+0426> "<U+0420><U+0438><U+043E>" | 52734 |
| 17 | <U+041A><U+0440><U+0430><U+0441><U+043D><U+043E><U+044F><U+0440><U+0441><U+043A> <U+0422><U+0426> "<U+0412><U+0437><U+043B><U+0435><U+0442><U+043A><U+0430> <U+041F><U+043B><U+0430><U+0437><U+0430>" | 22950 |
| 18 | <U+041A><U+0440><U+0430><U+0441><U+043D><U+043E><U+044F><U+0440><U+0441><U+043A> <U+0422><U+0426> "<U+0418><U+044E><U+043D><U+044C>" | 53227 |
| 19 | <U+041A><U+0443><U+0440><U+0441><U+043A> <U+0422><U+0426> "<U+041F><U+0443><U+0448><U+043A><U+0438><U+043D><U+0441><U+043A><U+0438><U+0439>" | 63911 |
| 20 | <U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> "<U+0420><U+0430><U+0441><U+043F><U+0440><U+043E><U+0434><U+0430><U+0436><U+0430>" | 1792 |
| 21 | <U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+041C><U+0422><U+0420><U+0426> "<U+0410><U+0444><U+0438> <U+041C><U+043E><U+043B><U+043B>" | 58133 |
| 22 | <U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+041C><U+0430><U+0433><U+0430><U+0437><U+0438><U+043D> <U+0421>21 | 45434 |
| 23 | <U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+041A> "<U+0411><U+0443><U+0434><U+0435><U+043D><U+043E><U+0432><U+0441><U+043A><U+0438><U+0439>" (<U+043F><U+0430><U+0432>.<U+0410>2) | 6963 |
| 24 | <U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+041A> "<U+0411><U+0443><U+0434><U+0435><U+043D><U+043E><U+0432><U+0441><U+043A><U+0438><U+0439>" (<U+043F><U+0430><U+0432>.<U+041A>7) | 53032 |
| 25 | <U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0420><U+041A> "<U+0410><U+0442><U+0440><U+0438><U+0443><U+043C>" | 186104 |
| 26 | <U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0426> "<U+0410><U+0440><U+0435><U+0430><U+043B>" (<U+0411><U+0435><U+043B><U+044F><U+0435><U+0432><U+043E>) | 53910 |
| 27 | <U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0426> "<U+041C><U+0415><U+0413><U+0410> <U+0411><U+0435><U+043B><U+0430><U+044F> <U+0414><U+0430><U+0447><U+0430> II" | 105366 |
| 28 | <U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0426> "<U+041C><U+0415><U+0413><U+0410> <U+0422><U+0435><U+043F><U+043B><U+044B><U+0439> <U+0421><U+0442><U+0430><U+043D>" II | 142234 |
| 29 | <U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0426> "<U+041D><U+043E><U+0432><U+044B><U+0439> <U+0432><U+0435><U+043A>" (<U+041D><U+043E><U+0432><U+043E><U+043A><U+043E><U+0441><U+0438><U+043D><U+043E>) | 49225 |
| 30 | <U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0426> "<U+041F><U+0435><U+0440><U+043B><U+043E><U+0432><U+0441><U+043A><U+0438><U+0439>" | 50860 |
| 31 | <U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0426> "<U+0421><U+0435><U+043C><U+0435><U+043D><U+043E><U+0432><U+0441><U+043A><U+0438><U+0439>" | 235636 |
| 32 | <U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0426> "<U+0421><U+0435><U+0440><U+0435><U+0431><U+0440><U+044F><U+043D><U+044B><U+0439> <U+0414><U+043E><U+043C>" | 7947 |
| 33 | <U+041C><U+044B><U+0442><U+0438><U+0449><U+0438> <U+0422><U+0420><U+041A> "XL-3" | 5027 |
| 34 | <U+041D>.<U+041D><U+043E><U+0432><U+0433><U+043E><U+0440><U+043E><U+0434> <U+0422><U+0420><U+0426> "<U+0420><U+0418><U+041E>" | 5752 |
| 35 | <U+041D>.<U+041D><U+043E><U+0432><U+0433><U+043E><U+0440><U+043E><U+0434> <U+0422><U+0420><U+0426> "<U+0424><U+0430><U+043D><U+0442><U+0430><U+0441><U+0442><U+0438><U+043A><U+0430>" | 58445 |
| 36 | <U+041D><U+043E><U+0432><U+043E><U+0441><U+0438><U+0431><U+0438><U+0440><U+0441><U+043A> <U+0422><U+0420><U+0426> "<U+0413><U+0430><U+043B><U+0435><U+0440><U+0435><U+044F> <U+041D><U+043E><U+0432><U+043E><U+0441><U+0438><U+0431><U+0438><U+0440><U+0441><U+043A>" | 306 |
| 37 | <U+041D><U+043E><U+0432><U+043E><U+0441><U+0438><U+0431><U+0438><U+0440><U+0441><U+043A> <U+0422><U+0426> "<U+041C><U+0435><U+0433><U+0430>" | 39638 |
| 38 | <U+041E><U+043C><U+0441><U+043A> <U+0422><U+0426> "<U+041C><U+0435><U+0433><U+0430>" | 46013 |
| 39 | <U+0420><U+043E><U+0441><U+0442><U+043E><U+0432><U+041D><U+0430><U+0414><U+043E><U+043D><U+0443> <U+0422><U+0420><U+041A> "<U+041C><U+0435><U+0433><U+0430><U+0446><U+0435><U+043D><U+0442><U+0440> <U+0413><U+043E><U+0440><U+0438><U+0437><U+043E><U+043D><U+0442>" | 13440 |
| 40 | <U+0420><U+043E><U+0441><U+0442><U+043E><U+0432><U+041D><U+0430><U+0414><U+043E><U+043D><U+0443> <U+0422><U+0420><U+041A> "<U+041C><U+0435><U+0433><U+0430><U+0446><U+0435><U+043D><U+0442><U+0440> <U+0413><U+043E><U+0440><U+0438><U+0437><U+043E><U+043D><U+0442>" <U+041E><U+0441><U+0442><U+0440><U+043E><U+0432><U+043D><U+043E><U+0439> | 4257 |
| 41 | <U+0420><U+043E><U+0441><U+0442><U+043E><U+0432><U+041D><U+0430><U+0414><U+043E><U+043D><U+0443> <U+0422><U+0426> "<U+041C><U+0435><U+0433><U+0430>" | 41967 |
| 42 | <U+0421><U+041F><U+0431> <U+0422><U+041A> "<U+041D><U+0435><U+0432><U+0441><U+043A><U+0438><U+0439> <U+0426><U+0435><U+043D><U+0442><U+0440>" | 109253 |
| 43 | <U+0421><U+041F><U+0431> <U+0422><U+041A> "<U+0421><U+0435><U+043D><U+043D><U+0430><U+044F>" | 39282 |
| 44 | <U+0421><U+0430><U+043C><U+0430><U+0440><U+0430> <U+0422><U+0426> "<U+041C><U+0435><U+043B><U+043E><U+0434><U+0438><U+044F>" | 39530 |
| 45 | <U+0421><U+0430><U+043C><U+0430><U+0440><U+0430> <U+0422><U+0426> "<U+041F><U+0430><U+0440><U+043A><U+0425><U+0430><U+0443><U+0441>" | 35891 |
| 46 | <U+0421><U+0435><U+0440><U+0433><U+0438><U+0435><U+0432> <U+041F><U+043E><U+0441><U+0430><U+0434> <U+0422><U+0426> "7<U+042F>" | 66321 |
| 47 | <U+0421><U+0443><U+0440><U+0433><U+0443><U+0442> <U+0422><U+0420><U+0426> "<U+0421><U+0438><U+0442><U+0438> <U+041C><U+043E><U+043B><U+043B>" | 56695 |
| 48 | <U+0422><U+043E><U+043C><U+0441><U+043A> <U+0422><U+0420><U+0426> "<U+0418><U+0437><U+0443><U+043C><U+0440><U+0443><U+0434><U+043D><U+044B><U+0439> <U+0413><U+043E><U+0440><U+043E><U+0434>" | 21612 |
| 49 | <U+0422><U+044E><U+043C><U+0435><U+043D><U+044C> <U+0422><U+0420><U+0426> "<U+041A><U+0440><U+0438><U+0441><U+0442><U+0430><U+043B><U+043B>" | 15849 |
| 50 | <U+0422><U+044E><U+043C><U+0435><U+043D><U+044C> <U+0422><U+0426> "<U+0413><U+0443><U+0434><U+0432><U+0438><U+043D>" | 65173 |
| 51 | <U+0422><U+044E><U+043C><U+0435><U+043D><U+044C> <U+0422><U+0426> "<U+0417><U+0435><U+043B><U+0435><U+043D><U+044B><U+0439> <U+0411><U+0435><U+0440><U+0435><U+0433>" | 44433 |
| 52 | <U+0423><U+0444><U+0430> <U+0422><U+041A> "<U+0426><U+0435><U+043D><U+0442><U+0440><U+0430><U+043B><U+044C><U+043D><U+044B><U+0439>" | 43502 |
| 53 | <U+0423><U+0444><U+0430> <U+0422><U+0426> "<U+0421><U+0435><U+043C><U+044C><U+044F>" 2 | 52921 |
| 54 | <U+0425><U+0438><U+043C><U+043A><U+0438> <U+0422><U+0426> "<U+041C><U+0435><U+0433><U+0430>" | 143480 |
| 55 | <U+0426><U+0438><U+0444><U+0440><U+043E><U+0432><U+043E><U+0439> <U+0441><U+043A><U+043B><U+0430><U+0434> 1<U+0421>-<U+041E><U+043D><U+043B><U+0430><U+0439><U+043D> | 34769 |
| 56 | <U+0427><U+0435><U+0445><U+043E><U+0432> <U+0422><U+0420><U+0426> "<U+041A><U+0430><U+0440><U+043D><U+0430><U+0432><U+0430><U+043B>" | 69573 |
| 57 | <U+042F><U+043A><U+0443><U+0442><U+0441><U+043A> <U+041E><U+0440><U+0434><U+0436><U+043E><U+043D><U+0438><U+043A><U+0438><U+0434><U+0437><U+0435>, 56 | 117428 |
| 58 | <U+042F><U+043A><U+0443><U+0442><U+0441><U+043A> <U+0422><U+0426> "<U+0426><U+0435><U+043D><U+0442><U+0440><U+0430><U+043B><U+044C><U+043D><U+044B><U+0439>" | 71441 |
| 59 | <U+042F><U+0440><U+043E><U+0441><U+043B><U+0430><U+0432><U+043B><U+044C> <U+0422><U+0426> "<U+0410><U+043B><U+044C><U+0442><U+0430><U+0438><U+0440>" | 42108 |
# The will give the most popular shop (i.e. by sales volume)
popular_shop = merged_sales %>%
group_by(shop_id,shop_name) %>%
summarise(total = sum(item_cnt_day)) %>%
arrange(desc(total))
popular_shop
`summarise()` regrouping output by 'shop_id' (override with `.groups` argument)
| shop_id | shop_name | total |
|---|---|---|
| <fct> | <chr> | <dbl> |
| 31 | <U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0426> "<U+0421><U+0435><U+043C><U+0435><U+043D><U+043E><U+0432><U+0441><U+043A><U+0438><U+0439>" | 310777 |
| 25 | <U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0420><U+041A> "<U+0410><U+0442><U+0440><U+0438><U+0443><U+043C>" | 241920 |
| 54 | <U+0425><U+0438><U+043C><U+043A><U+0438> <U+0422><U+0426> "<U+041C><U+0435><U+0433><U+0430>" | 185790 |
| 28 | <U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0426> "<U+041C><U+0415><U+0413><U+0410> <U+0422><U+0435><U+043F><U+043B><U+044B><U+0439> <U+0421><U+0442><U+0430><U+043D>" II | 184557 |
| 42 | <U+0421><U+041F><U+0431> <U+0422><U+041A> "<U+041D><U+0435><U+0432><U+0441><U+043A><U+0438><U+0439> <U+0426><U+0435><U+043D><U+0442><U+0440>" | 144934 |
| 57 | <U+042F><U+043A><U+0443><U+0442><U+0441><U+043A> <U+041E><U+0440><U+0434><U+0436><U+043E><U+043D><U+0438><U+043A><U+0438><U+0434><U+0437><U+0435>, 56 | 141107 |
| 27 | <U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0426> "<U+041C><U+0415><U+0413><U+0410> <U+0411><U+0435><U+043B><U+0430><U+044F> <U+0414><U+0430><U+0447><U+0430> II" | 136657 |
| 6 | <U+0412><U+043E><U+0440><U+043E><U+043D><U+0435><U+0436> (<U+041F><U+043B><U+0435><U+0445><U+0430><U+043D><U+043E><U+0432><U+0441><U+043A><U+0430><U+044F>, 13) | 100489 |
| 58 | <U+042F><U+043A><U+0443><U+0442><U+0441><U+043A> <U+0422><U+0426> "<U+0426><U+0435><U+043D><U+0442><U+0440><U+0430><U+043B><U+044C><U+043D><U+044B><U+0439>" | 81734 |
| 46 | <U+0421><U+0435><U+0440><U+0433><U+0438><U+0435><U+0432> <U+041F><U+043E><U+0441><U+0430><U+0434> <U+0422><U+0426> "7<U+042F>" | 78990 |
| 56 | <U+0427><U+0435><U+0445><U+043E><U+0432> <U+0422><U+0420><U+0426> "<U+041A><U+0430><U+0440><U+043D><U+0430><U+0432><U+0430><U+043B>" | 78079 |
| 50 | <U+0422><U+044E><U+043C><U+0435><U+043D><U+044C> <U+0422><U+0426> "<U+0413><U+0443><U+0434><U+0432><U+0438><U+043D>" | 76238 |
| 12 | <U+0418><U+043D><U+0442><U+0435><U+0440><U+043D><U+0435><U+0442>-<U+043C><U+0430><U+0433><U+0430><U+0437><U+0438><U+043D> <U+0427><U+0421> | 73478 |
| 19 | <U+041A><U+0443><U+0440><U+0441><U+043A> <U+0422><U+0426> "<U+041F><U+0443><U+0448><U+043A><U+0438><U+043D><U+0441><U+043A><U+0438><U+0439>" | 73455 |
| 15 | <U+041A><U+0430><U+043B><U+0443><U+0433><U+0430> <U+0422><U+0420><U+0426> "XXI <U+0432><U+0435><U+043A>" | 71201 |
| 35 | <U+041D>.<U+041D><U+043E><U+0432><U+0433><U+043E><U+0440><U+043E><U+0434> <U+0422><U+0420><U+0426> "<U+0424><U+0430><U+043D><U+0442><U+0430><U+0441><U+0442><U+0438><U+043A><U+0430>" | 69016 |
| 21 | <U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+041C><U+0422><U+0420><U+0426> "<U+0410><U+0444><U+0438> <U+041C><U+043E><U+043B><U+043B>" | 68560 |
| 26 | <U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0426> "<U+0410><U+0440><U+0435><U+0430><U+043B>" (<U+0411><U+0435><U+043B><U+044F><U+0435><U+0432><U+043E>) | 67890 |
| 47 | <U+0421><U+0443><U+0440><U+0433><U+0443><U+0442> <U+0422><U+0420><U+0426> "<U+0421><U+0438><U+0442><U+0438> <U+041C><U+043E><U+043B><U+043B>" | 67637 |
| 7 | <U+0412><U+043E><U+0440><U+043E><U+043D><U+0435><U+0436> <U+0422><U+0420><U+0426> "<U+041C><U+0430><U+043A><U+0441><U+0438><U+043C><U+0438><U+0440>" | 67058 |
| 18 | <U+041A><U+0440><U+0430><U+0441><U+043D><U+043E><U+044F><U+0440><U+0441><U+043A> <U+0422><U+0426> "<U+0418><U+044E><U+043D><U+044C>" | 65486 |
| 24 | <U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+041A> "<U+0411><U+0443><U+0434><U+0435><U+043D><U+043E><U+0432><U+0441><U+043A><U+0438><U+0439>" (<U+043F><U+0430><U+0432>.<U+041A>7) | 63886 |
| 55 | <U+0426><U+0438><U+0444><U+0440><U+043E><U+0432><U+043E><U+0439> <U+0441><U+043A><U+043B><U+0430><U+0434> 1<U+0421>-<U+041E><U+043D><U+043B><U+0430><U+0439><U+043D> | 63388 |
| 53 | <U+0423><U+0444><U+0430> <U+0422><U+0426> "<U+0421><U+0435><U+043C><U+044C><U+044F>" 2 | 61657 |
| 16 | <U+041A><U+043E><U+043B><U+043E><U+043C><U+043D><U+0430> <U+0422><U+0426> "<U+0420><U+0438><U+043E>" | 61633 |
| 30 | <U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0426> "<U+041F><U+0435><U+0440><U+043B><U+043E><U+0432><U+0441><U+043A><U+0438><U+0439>" | 60828 |
| 22 | <U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+041C><U+0430><U+0433><U+0430><U+0437><U+0438><U+043D> <U+0421>21 | 60230 |
| 29 | <U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0426> "<U+041D><U+043E><U+0432><U+044B><U+0439> <U+0432><U+0435><U+043A>" (<U+041D><U+043E><U+0432><U+043E><U+043A><U+043E><U+0441><U+0438><U+043D><U+043E>) | 58713 |
| 38 | <U+041E><U+043C><U+0441><U+043A> <U+0422><U+0426> "<U+041C><U+0435><U+0433><U+0430>" | 53886 |
| 43 | <U+0421><U+041F><U+0431> <U+0422><U+041A> "<U+0421><U+0435><U+043D><U+043D><U+0430><U+044F>" | 50608 |
| 52 | <U+0423><U+0444><U+0430> <U+0422><U+041A> "<U+0426><U+0435><U+043D><U+0442><U+0440><U+0430><U+043B><U+044C><U+043D><U+044B><U+0439>" | 49744 |
| 41 | <U+0420><U+043E><U+0441><U+0442><U+043E><U+0432><U+041D><U+0430><U+0414><U+043E><U+043D><U+0443> <U+0422><U+0426> "<U+041C><U+0435><U+0433><U+0430>" | 49324 |
| 59 | <U+042F><U+0440><U+043E><U+0441><U+043B><U+0430><U+0432><U+043B><U+044C> <U+0422><U+0426> "<U+0410><U+043B><U+044C><U+0442><U+0430><U+0438><U+0440>" | 48993 |
| 51 | <U+0422><U+044E><U+043C><U+0435><U+043D><U+044C> <U+0422><U+0426> "<U+0417><U+0435><U+043B><U+0435><U+043D><U+044B><U+0439> <U+0411><U+0435><U+0440><U+0435><U+0433>" | 48767 |
| 14 | <U+041A><U+0430><U+0437><U+0430><U+043D><U+044C> <U+0422><U+0426> "<U+041F><U+0430><U+0440><U+043A><U+0425><U+0430><U+0443><U+0441>" II | 46375 |
| 37 | <U+041D><U+043E><U+0432><U+043E><U+0441><U+0438><U+0431><U+0438><U+0440><U+0441><U+043A> <U+0422><U+0426> "<U+041C><U+0435><U+0433><U+0430>" | 46256 |
| 44 | <U+0421><U+0430><U+043C><U+0430><U+0440><U+0430> <U+0422><U+0426> "<U+041C><U+0435><U+043B><U+043E><U+0434><U+0438><U+044F>" | 44938 |
| 4 | <U+0412><U+043E><U+043B><U+0436><U+0441><U+043A><U+0438><U+0439> <U+0422><U+0426> "<U+0412><U+043E><U+043B><U+0433><U+0430> <U+041C><U+043E><U+043B><U+043B>" | 43942 |
| 5 | <U+0412><U+043E><U+043B><U+043E><U+0433><U+0434><U+0430> <U+0422><U+0420><U+0426> "<U+041C><U+0430><U+0440><U+043C><U+0435><U+043B><U+0430><U+0434>" | 42762 |
| 45 | <U+0421><U+0430><U+043C><U+0430><U+0440><U+0430> <U+0422><U+0426> "<U+041F><U+0430><U+0440><U+043A><U+0425><U+0430><U+0443><U+0441>" | 41895 |
| 2 | <U+0410><U+0434><U+044B><U+0433><U+0435><U+044F> <U+0422><U+0426> "<U+041C><U+0435><U+0433><U+0430>" | 30620 |
| 3 | <U+0411><U+0430><U+043B><U+0430><U+0448><U+0438><U+0445><U+0430> <U+0422><U+0420><U+041A> "<U+041E><U+043A><U+0442><U+044F><U+0431><U+0440><U+044C>-<U+041A><U+0438><U+043D><U+043E><U+043C><U+0438><U+0440>" | 28355 |
| 17 | <U+041A><U+0440><U+0430><U+0441><U+043D><U+043E><U+044F><U+0440><U+0441><U+043A> <U+0422><U+0426> "<U+0412><U+0437><U+043B><U+0435><U+0442><U+043A><U+0430> <U+041F><U+043B><U+0430><U+0437><U+0430>" | 25838 |
| 48 | <U+0422><U+043E><U+043C><U+0441><U+043A> <U+0422><U+0420><U+0426> "<U+0418><U+0437><U+0443><U+043C><U+0440><U+0443><U+0434><U+043D><U+044B><U+0439> <U+0413><U+043E><U+0440><U+043E><U+0434>" | 24909 |
| 10 | <U+0416><U+0443><U+043A><U+043E><U+0432><U+0441><U+043A><U+0438><U+0439> <U+0443><U+043B>. <U+0427><U+043A><U+0430><U+043B><U+043E><U+0432><U+0430> 39<U+043C>? | 24523 |
| 13 | <U+041A><U+0430><U+0437><U+0430><U+043D><U+044C> <U+0422><U+0426> "<U+0411><U+0435><U+0445><U+0435><U+0442><U+043B><U+0435>" | 19763 |
| 49 | <U+0422><U+044E><U+043C><U+0435><U+043D><U+044C> <U+0422><U+0420><U+0426> "<U+041A><U+0440><U+0438><U+0441><U+0442><U+0430><U+043B><U+043B>" | 17090 |
| 39 | <U+0420><U+043E><U+0441><U+0442><U+043E><U+0432><U+041D><U+0430><U+0414><U+043E><U+043D><U+0443> <U+0422><U+0420><U+041A> "<U+041C><U+0435><U+0433><U+0430><U+0446><U+0435><U+043D><U+0442><U+0440> <U+0413><U+043E><U+0440><U+0438><U+0437><U+043E><U+043D><U+0442>" | 16617 |
| 9 | <U+0412><U+044B><U+0435><U+0437><U+0434><U+043D><U+0430><U+044F> <U+0422><U+043E><U+0440><U+0433><U+043E><U+0432><U+043B><U+044F> | 15866 |
| 0 | !<U+042F><U+043A><U+0443><U+0442><U+0441><U+043A> <U+041E><U+0440><U+0434><U+0436><U+043E><U+043D><U+0438><U+043A><U+0438><U+0434><U+0437><U+0435>, 56 <U+0444><U+0440><U+0430><U+043D> | 11705 |
| 32 | <U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0426> "<U+0421><U+0435><U+0440><U+0435><U+0431><U+0440><U+044F><U+043D><U+044B><U+0439> <U+0414><U+043E><U+043C>" | 8781 |
| 23 | <U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+041A> "<U+0411><U+0443><U+0434><U+0435><U+043D><U+043E><U+0432><U+0441><U+043A><U+0438><U+0439>" (<U+043F><U+0430><U+0432>.<U+0410>2) | 7705 |
| 34 | <U+041D>.<U+041D><U+043E><U+0432><U+0433><U+043E><U+0440><U+043E><U+0434> <U+0422><U+0420><U+0426> "<U+0420><U+0418><U+041E>" | 6451 |
| 1 | !<U+042F><U+043A><U+0443><U+0442><U+0441><U+043A> <U+0422><U+0426> "<U+0426><U+0435><U+043D><U+0442><U+0440><U+0430><U+043B><U+044C><U+043D><U+044B><U+0439>" <U+0444><U+0440><U+0430><U+043D> | 6311 |
| 20 | <U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> "<U+0420><U+0430><U+0441><U+043F><U+0440><U+043E><U+0434><U+0430><U+0436><U+0430>" | 5872 |
| 33 | <U+041C><U+044B><U+0442><U+0438><U+0449><U+0438> <U+0422><U+0420><U+041A> "XL-3" | 5482 |
| 40 | <U+0420><U+043E><U+0441><U+0442><U+043E><U+0432><U+041D><U+0430><U+0414><U+043E><U+043D><U+0443> <U+0422><U+0420><U+041A> "<U+041C><U+0435><U+0433><U+0430><U+0446><U+0435><U+043D><U+0442><U+0440> <U+0413><U+043E><U+0440><U+0438><U+0437><U+043E><U+043D><U+0442>" <U+041E><U+0441><U+0442><U+0440><U+043E><U+0432><U+043D><U+043E><U+0439> | 4943 |
| 8 | <U+0412><U+043E><U+0440><U+043E><U+043D><U+0435><U+0436> <U+0422><U+0420><U+0426> <U+0421><U+0438><U+0442><U+0438>-<U+041F><U+0430><U+0440><U+043A> "<U+0413><U+0440><U+0430><U+0434>" | 3595 |
| 11 | <U+0416><U+0443><U+043A><U+043E><U+0432><U+0441><U+043A><U+0438><U+0439> <U+0443><U+043B>. <U+0427><U+043A><U+0430><U+043B><U+043E><U+0432><U+0430> 39<U+043C>² | 572 |
| 36 | <U+041D><U+043E><U+0432><U+043E><U+0441><U+0438><U+0431><U+0438><U+0440><U+0441><U+043A> <U+0422><U+0420><U+0426> "<U+0413><U+0430><U+043B><U+0435><U+0440><U+0435><U+044F> <U+041D><U+043E><U+0432><U+043E><U+0441><U+0438><U+0431><U+0438><U+0440><U+0441><U+043A>" | 330 |
library(plotly)
Attaching package: 'plotly'
The following object is masked from 'package:ggplot2':
last_plot
The following object is masked from 'package:stats':
filter
The following object is masked from 'package:graphics':
layout
# Plot for the shop id vs total quantity sold
plot_ly(popular_shop, x=~total, y=~reorder(as.factor(shop_id) , total),
type="bar", color=~shop_id, hoverinfo='text', text=~paste(shop_name,':',total), orientation='h') %>%
layout(title="Most Popular Shop by Total Quantity Sold.", xaxis=list(title="Total Quantity Sold"),
yaxis=list(title="Shop ID"),legend=list(title=list(text='<b> Shop ID </b>')))
# Unique number of items
unique_items = merged_sales %>%
distinct(item_id) %>%
summarise(items=n())
unique_items
| items |
|---|
| <int> |
| 21807 |
item_freq = merged_sales %>%
group_by(item_id,item_name) %>%
summarise(freq =n()) %>%
arrange(desc(freq))
head(item_freq)
`summarise()` regrouping output by 'item_id' (override with `.groups` argument)
| item_id | item_name | freq |
|---|---|---|
| <fct> | <chr> | <int> |
| 20949 | <U+0424><U+0438><U+0440><U+043C><U+0435><U+043D><U+043D><U+044B><U+0439> <U+043F><U+0430><U+043A><U+0435><U+0442> <U+043C><U+0430><U+0439><U+043A><U+0430> 1<U+0421> <U+0418><U+043D><U+0442><U+0435><U+0440><U+0435><U+0441> <U+0431><U+0435><U+043B><U+044B><U+0439> (34*42) 45 <U+043C><U+043A><U+043C> | 31340 |
| 5822 | Playstation Store <U+043F><U+043E><U+043F><U+043E><U+043B><U+043D><U+0435><U+043D><U+0438><U+0435> <U+0431><U+0443><U+043C><U+0430><U+0436><U+043D><U+0438><U+043A><U+0430>: <U+041A><U+0430><U+0440><U+0442><U+0430> <U+043E><U+043F><U+043B><U+0430><U+0442><U+044B> 1000 <U+0440><U+0443><U+0431>. | 9408 |
| 17717 | <U+041F><U+0440><U+0438><U+0435><U+043C> <U+0434><U+0435><U+043D><U+0435><U+0436><U+043D><U+044B><U+0445> <U+0441><U+0440><U+0435><U+0434><U+0441><U+0442><U+0432> <U+0434><U+043B><U+044F> 1<U+0421>-<U+041E><U+043D><U+043B><U+0430><U+0439><U+043D> | 9067 |
| 2808 | Diablo III [PC, Jewel, <U+0440><U+0443><U+0441><U+0441><U+043A><U+0430><U+044F> <U+0432><U+0435><U+0440><U+0441><U+0438><U+044F>] | 7479 |
| 4181 | Kaspersky Internet Security Multi-Device Russian Edition. 2-Device 1 year Renewal Box | 6853 |
| 7856 | World of Warcraft. <U+041A><U+0430><U+0440><U+0442><U+0430> <U+043E><U+043F><U+043B><U+0430><U+0442><U+044B> <U+0438><U+0433><U+0440><U+043E><U+0432><U+043E><U+0433><U+043E> <U+0432><U+0440><U+0435><U+043C><U+0435><U+043D><U+0438> (online) (<U+0440><U+0443><U+0441>.<U+0432>.) (60 <U+0434><U+043D><U+0435><U+0439>) (Jewel) | 6602 |
# Most sold item for each shop
popular_item = merged_sales %>%
group_by(shop_id, item_id) %>%
summarise(quantity_sold = sum(item_cnt_day)) %>%
filter(quantity_sold == max(quantity_sold))
popular_item
`summarise()` regrouping output by 'shop_id' (override with `.groups` argument)
| shop_id | item_id | quantity_sold |
|---|---|---|
| <fct> | <fct> | <dbl> |
| 0 | 19811 | 79 |
| 1 | 13354 | 62 |
| 2 | 20949 | 1319 |
| 3 | 20949 | 1137 |
| 4 | 20949 | 2465 |
| 5 | 20949 | 2163 |
| 6 | 20949 | 3551 |
| 7 | 20949 | 3321 |
| 8 | 12168 | 31 |
| 9 | 7096 | 305 |
| 10 | 20949 | 1413 |
| 11 | 20949 | 46 |
| 12 | 11373 | 6563 |
| 13 | 20949 | 1023 |
| 14 | 20949 | 3219 |
| 15 | 20949 | 2907 |
| 16 | 20949 | 3395 |
| 17 | 20949 | 1377 |
| 18 | 20949 | 2826 |
| 19 | 20949 | 2347 |
| 20 | 15275 | 141 |
| 21 | 20949 | 4335 |
| 22 | 20949 | 4382 |
| 23 | 4164 | 89 |
| 24 | 20949 | 2918 |
| 25 | 20949 | 14343 |
| 26 | 20949 | 3624 |
| 27 | 20949 | 8265 |
| 28 | 20949 | 13458 |
| 29 | 20949 | 2167 |
| 30 | 20949 | 3922 |
| 31 | 20949 | 19934 |
| 32 | 2808 | 178 |
| 33 | 20949 | 379 |
| 34 | 20949 | 328 |
| 35 | 20949 | 3434 |
| 36 | 20949 | 16 |
| 37 | 20949 | 2727 |
| 38 | 20949 | 2096 |
| 39 | 20949 | 1867 |
| 40 | 20949 | 648 |
| 41 | 20949 | 2100 |
| 42 | 20949 | 10514 |
| 43 | 20949 | 2266 |
| 44 | 20949 | 1060 |
| 45 | 20949 | 1404 |
| 46 | 20949 | 3891 |
| 47 | 20949 | 4530 |
| 48 | 20949 | 1619 |
| 49 | 20949 | 722 |
| 50 | 20949 | 2544 |
| 51 | 20949 | 1074 |
| 52 | 20949 | 1920 |
| 53 | 20949 | 3540 |
| 54 | 20949 | 12323 |
| 55 | 7967 | 2365 |
| 56 | 20949 | 3439 |
| 57 | 20949 | 6116 |
| 58 | 20949 | 2928 |
| 59 | 20949 | 2225 |
# Plot for the most quantity sold for each shop
plot_ly(popular_item,y=~shop_id, x=~quantity_sold,
type="bar", color=~as.factor(item_id), orientation='h', hoverinfo='text', text=~paste('Shop',shop_id,':','Item',item_id)) %>%
layout(title="Most Popular Item per Shop", yaxis=list(title="Shop"), xaxis=list(title="Quantity Sold"), legend=list(title=list(text='<b> Item ID </b>')))
# Repetition of categories in the data set
item_cat_freq = merged_sales %>%
group_by(item_category_id,item_category_name) %>%
summarise(freq =n()) %>%
arrange(desc(freq))
head(item_cat_freq)
`summarise()` regrouping output by 'item_category_id' (override with `.groups` argument)
| item_category_id | item_category_name | freq |
|---|---|---|
| <fct> | <chr> | <int> |
| 40 | <U+041A><U+0438><U+043D><U+043E> - DVD | 564652 |
| 30 | <U+0418><U+0433><U+0440><U+044B> PC - <U+0421><U+0442><U+0430><U+043D><U+0434><U+0430><U+0440><U+0442><U+043D><U+044B><U+0435> <U+0438><U+0437><U+0434><U+0430><U+043D><U+0438><U+044F> | 351591 |
| 55 | <U+041C><U+0443><U+0437><U+044B><U+043A><U+0430> - CD <U+043B><U+043E><U+043A><U+0430><U+043B><U+044C><U+043D><U+043E><U+0433><U+043E> <U+043F><U+0440><U+043E><U+0438><U+0437><U+0432><U+043E><U+0434><U+0441><U+0442><U+0432><U+0430> | 339585 |
| 19 | <U+0418><U+0433><U+0440><U+044B> - PS3 | 208219 |
| 37 | <U+041A><U+0438><U+043D><U+043E> - Blu-Ray | 192674 |
| 23 | <U+0418><U+0433><U+0440><U+044B> - XBOX 360 | 146789 |
# Find number of unique categories
item_cat_unique = merged_sales %>%
select(item_category_id) %>%
distinct() %>%
count()
item_cat_unique
| n |
|---|
| <int> |
| 84 |
# Find number of categories per shop
categories_shop = merged_sales %>%
group_by(shop_id) %>%
distinct(item_category_id) %>%
summarise(no_distinct_categories = n())
categories_shop
`summarise()` ungrouping output (override with `.groups` argument)
| shop_id | no_distinct_categories |
|---|---|
| <fct> | <int> |
| 0 | 48 |
| 1 | 41 |
| 2 | 59 |
| 3 | 58 |
| 4 | 61 |
| 5 | 60 |
| 6 | 63 |
| 7 | 60 |
| 8 | 43 |
| 9 | 38 |
| 10 | 57 |
| 11 | 35 |
| 12 | 64 |
| 13 | 39 |
| 14 | 60 |
| 15 | 59 |
| 16 | 63 |
| 17 | 55 |
| 18 | 62 |
| 19 | 60 |
| 20 | 12 |
| 21 | 61 |
| 22 | 61 |
| 23 | 44 |
| 24 | 58 |
| 25 | 65 |
| 26 | 61 |
| 27 | 60 |
| 28 | 60 |
| 29 | 60 |
| 30 | 59 |
| 31 | 62 |
| 32 | 47 |
| 33 | 47 |
| 34 | 49 |
| 35 | 62 |
| 36 | 35 |
| 37 | 63 |
| 38 | 63 |
| 39 | 56 |
| 40 | 26 |
| 41 | 60 |
| 42 | 62 |
| 43 | 57 |
| 44 | 62 |
| 45 | 59 |
| 46 | 60 |
| 47 | 60 |
| 48 | 52 |
| 49 | 54 |
| 50 | 59 |
| 51 | 56 |
| 52 | 61 |
| 53 | 60 |
| 54 | 59 |
| 55 | 12 |
| 56 | 63 |
| 57 | 61 |
| 58 | 58 |
| 59 | 60 |
# Plot for shop vs number of categories for each shop
plot_ly(categories_shop, x=~no_distinct_categories, y=~reorder(as.factor(shop_id), no_distinct_categories),
type='bar', orientation='h', color=~as.factor(shop_id), hoverinfo='text', text=~paste('Shop',shop_id,':',no_distinct_categories,'Categories')) %>%
layout(title= "Number of categories per shop", xaxis=list(title= "Number of distinct categories"), yaxis=list(title= "Shops"), showlegend=FALSE)
# Find top 10 selling products
top10_prod = merged_sales %>%
group_by(item_id,item_name) %>%
summarise(qty_vol = sum(item_cnt_day)) %>%
arrange(desc(qty_vol)) %>%
ungroup() %>%
top_n(10)
top10_prod
`summarise()` regrouping output by 'item_id' (override with `.groups` argument) Selecting by qty_vol
| item_id | item_name | qty_vol |
|---|---|---|
| <fct> | <chr> | <dbl> |
| 20949 | <U+0424><U+0438><U+0440><U+043C><U+0435><U+043D><U+043D><U+044B><U+0439> <U+043F><U+0430><U+043A><U+0435><U+0442> <U+043C><U+0430><U+0439><U+043A><U+0430> 1<U+0421> <U+0418><U+043D><U+0442><U+0435><U+0440><U+0435><U+0441> <U+0431><U+0435><U+043B><U+044B><U+0439> (34*42) 45 <U+043C><U+043A><U+043C> | 187642 |
| 2808 | Diablo III [PC, Jewel, <U+0440><U+0443><U+0441><U+0441><U+043A><U+0430><U+044F> <U+0432><U+0435><U+0440><U+0441><U+0438><U+044F>] | 17245 |
| 3732 | Grand Theft Auto V [PS3, <U+0440><U+0443><U+0441><U+0441><U+043A><U+0438><U+0435> <U+0441><U+0443><U+0431><U+0442><U+0438><U+0442><U+0440><U+044B>] | 16642 |
| 17717 | <U+041F><U+0440><U+0438><U+0435><U+043C> <U+0434><U+0435><U+043D><U+0435><U+0436><U+043D><U+044B><U+0445> <U+0441><U+0440><U+0435><U+0434><U+0441><U+0442><U+0432> <U+0434><U+043B><U+044F> 1<U+0421>-<U+041E><U+043D><U+043B><U+0430><U+0439><U+043D> | 15830 |
| 5822 | Playstation Store <U+043F><U+043E><U+043F><U+043E><U+043B><U+043D><U+0435><U+043D><U+0438><U+0435> <U+0431><U+0443><U+043C><U+0430><U+0436><U+043D><U+0438><U+043A><U+0430>: <U+041A><U+0430><U+0440><U+0442><U+0430> <U+043E><U+043F><U+043B><U+0430><U+0442><U+044B> 1000 <U+0440><U+0443><U+0431>. | 14515 |
| 3734 | Grand Theft Auto V [Xbox 360, <U+0440><U+0443><U+0441><U+0441><U+043A><U+0438><U+0435> <U+0441><U+0443><U+0431><U+0442><U+0438><U+0442><U+0440><U+044B>] | 11688 |
| 6675 | Sony PlayStation 4 (500 Gb) Black (CUH-1008A/1108A/B01) | 10289 |
| 3731 | Grand Theft Auto V [PC, <U+0440><U+0443><U+0441><U+0441><U+043A><U+0438><U+0435> <U+0441><U+0443><U+0431><U+0442><U+0438><U+0442><U+0440><U+044B>] | 10099 |
| 1855 | Battlefield 4 [PC, <U+0440><U+0443><U+0441><U+0441><U+043A><U+0430><U+044F> <U+0432><U+0435><U+0440><U+0441><U+0438><U+044F>] | 10032 |
| 16787 | <U+041E><U+0434><U+043D><U+0438> <U+0438><U+0437> <U+043D><U+0430><U+0441> [PS3, <U+0440><U+0443><U+0441><U+0441><U+043A><U+0430><U+044F> <U+0432><U+0435><U+0440><U+0441><U+0438><U+044F>] | 9227 |
# Plot for the quantity vs item for the top 10 selling items
plot_ly(top10_prod, x=~item_name, y=~qty_vol, type='bar', color=~as.factor(item_id), hoverinfo='text', text=~paste(item_name,':',qty_vol)) %>%
layout(title="Top 10 Selling Products", xaxis=list(title="Item ID"), yaxis=list(title="Quantity Sold"), showlegend=FALSE)
# Find top 10 selling categories
top10_cat = merged_sales %>%
group_by(item_category_id,item_category_name) %>%
summarise(qty_vol = sum(item_cnt_day)) %>%
arrange(desc(qty_vol)) %>%
ungroup() %>%
top_n(10)
top10_cat
`summarise()` regrouping output by 'item_category_id' (override with `.groups` argument) Selecting by qty_vol
| item_category_id | item_category_name | qty_vol |
|---|---|---|
| <fct> | <chr> | <dbl> |
| 40 | <U+041A><U+0438><U+043D><U+043E> - DVD | 634171 |
| 30 | <U+0418><U+0433><U+0440><U+044B> PC - <U+0421><U+0442><U+0430><U+043D><U+0434><U+0430><U+0440><U+0442><U+043D><U+044B><U+0435> <U+0438><U+0437><U+0434><U+0430><U+043D><U+0438><U+044F> | 456540 |
| 55 | <U+041C><U+0443><U+0437><U+044B><U+043A><U+0430> - CD <U+043B><U+043E><U+043A><U+0430><U+043B><U+044C><U+043D><U+043E><U+0433><U+043E> <U+043F><U+0440><U+043E><U+0438><U+0437><U+0432><U+043E><U+0434><U+0441><U+0442><U+0432><U+0430> | 348591 |
| 19 | <U+0418><U+0433><U+0440><U+044B> - PS3 | 254887 |
| 37 | <U+041A><U+0438><U+043D><U+043E> - Blu-Ray | 203284 |
| 71 | <U+041F><U+043E><U+0434><U+0430><U+0440><U+043A><U+0438> - <U+0421><U+0443><U+043C><U+043A><U+0438>, <U+0410><U+043B><U+044C><U+0431><U+043E><U+043C><U+044B>, <U+041A><U+043E><U+0432><U+0440><U+0438><U+043A><U+0438> <U+0434>/<U+043C><U+044B><U+0448><U+0438> | 187998 |
| 28 | <U+0418><U+0433><U+0440><U+044B> PC - <U+0414><U+043E><U+043F><U+043E><U+043B><U+043D><U+0438><U+0442><U+0435><U+043B><U+044C><U+043D><U+044B><U+0435> <U+0438><U+0437><U+0434><U+0430><U+043D><U+0438><U+044F> | 174954 |
| 23 | <U+0418><U+0433><U+0440><U+044B> - XBOX 360 | 169944 |
| 20 | <U+0418><U+0433><U+0440><U+044B> - PS4 | 127319 |
| 65 | <U+041F><U+043E><U+0434><U+0430><U+0440><U+043A><U+0438> - <U+041D><U+0430><U+0441><U+0442><U+043E><U+043B><U+044C><U+043D><U+044B><U+0435> <U+0438><U+0433><U+0440><U+044B> (<U+043A><U+043E><U+043C><U+043F><U+0430><U+043A><U+0442><U+043D><U+044B><U+0435>) | 73077 |
# Plot for quantity vs categories for the top 10 selling categories
plot_ly(top10_cat, x=~item_category_name, y=~qty_vol, color=~as.factor(item_category_id), type='bar', hoverinfo='text', text=~paste(item_category_name,':',qty_vol)) %>%
layout(title="Top 10 Selling Categories", xaxis=list(title="Category"), yaxis=list(title="Quantity Sold"), showlegend=FALSE)
# Monthly sales for each year
year_month_sales = merged_sales %>%
group_by(year, month) %>%
summarise(sales_vol = sum(item_cnt_day * item_price)) %>%
arrange(year)
year_month_sales
`summarise()` regrouping output by 'year' (override with `.groups` argument)
| year | month | sales_vol |
|---|---|---|
| <fct> | <fct> | <dbl> |
| 2013 | 1 | 91947091 |
| 2013 | 2 | 90665707 |
| 2013 | 3 | 104932687 |
| 2013 | 4 | 69154286 |
| 2013 | 5 | 65065312 |
| 2013 | 6 | 79550406 |
| 2013 | 7 | 69350600 |
| 2013 | 8 | 74297970 |
| 2013 | 9 | 119081892 |
| 2013 | 10 | 107241613 |
| 2013 | 11 | 138665439 |
| 2013 | 12 | 207571731 |
| 2014 | 1 | 98226023 |
| 2014 | 2 | 107431123 |
| 2014 | 3 | 109115405 |
| 2014 | 4 | 80651755 |
| 2014 | 5 | 91281734 |
| 2014 | 6 | 82668539 |
| 2014 | 7 | 76487921 |
| 2014 | 8 | 89604092 |
| 2014 | 9 | 114723594 |
| 2014 | 10 | 112525522 |
| 2014 | 11 | 152037599 |
| 2014 | 12 | 232025171 |
| 2015 | 1 | 118927700 |
| 2015 | 2 | 86593884 |
| 2015 | 3 | 86476267 |
| 2015 | 4 | 90252814 |
| 2015 | 5 | 88577438 |
| 2015 | 6 | 65397995 |
| 2015 | 7 | 59405787 |
| 2015 | 8 | 61748965 |
| 2015 | 9 | 92905169 |
| 2015 | 10 | 84337113 |
# Plot for sales by volume vs year with montly bifurcation
plot_ly(year_month_sales, x=~year, y=~sales_vol, color=~month, type='bar', hoverinfo='text', text=~paste(month.name[month],':',round(sales_vol))) %>%
layout(title="Sales year-month Wise", xaxis=list(title="Year"), yaxis=list(title="Sales Volume"), barmode='stack', legend=list(title=list(text='<b>Month</b>')))
# Total yearly sales
year_sales = merged_sales %>%
group_by(year) %>%
summarise(sales_vol_year = sum(item_cnt_day * item_price))
year_sales
`summarise()` ungrouping output (override with `.groups` argument)
| year | sales_vol_year |
|---|---|
| <fct> | <dbl> |
| 2013 | 1217524734 |
| 2014 | 1346778479 |
| 2015 | 834623132 |
# Plot for total sales vs year
plot_ly(year_sales, x=~year, y=~sales_vol_year, type='bar', color=~year, hoverinfo='text', text=~paste(year,':',round(sales_vol_year))) %>%
layout(title="Total sales per Year", xaxis=list(title="Year"), yaxis=list(title="Total Sales"), legend=list(title=list(text='<b>Year</b>')))
# Find the best selling product by quantity
prod_popularity_years = merged_sales %>%
group_by(year, item_id) %>%
summarise(qty_vol = sum(item_cnt_day)) %>%
filter(qty_vol == max(qty_vol))
prod_popularity_years
`summarise()` regrouping output by 'year' (override with `.groups` argument)
| year | item_id | qty_vol |
|---|---|---|
| <fct> | <fct> | <dbl> |
| 2013 | 20949 | 63748 |
| 2014 | 20949 | 85288 |
| 2015 | 20949 | 38606 |
# Plot for the quantity of the most sold product vs year
plot_ly(prod_popularity_years, x=~year, y=~qty_vol, type='bar', hoverinfo='text', text=~paste(year,', Item',item_id,':',qty_vol)) %>%
layout(title="Most Popular Product per year", xaxis=~list(title="Year"), yaxis=~list(title="Quantity Sold"))
# Find the shop with highest sales volume
sales_volume = merged_sales %>%
group_by(shop_id,shop_name) %>%
summarise(sales_vol = sum(item_cnt_day * item_price)) %>%
arrange(desc(sales_vol))
sales_volume
`summarise()` regrouping output by 'shop_id' (override with `.groups` argument)
| shop_id | shop_name | sales_vol |
|---|---|---|
| <fct> | <chr> | <dbl> |
| 31 | <U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0426> "<U+0421><U+0435><U+043C><U+0435><U+043D><U+043E><U+0432><U+0441><U+043A><U+0438><U+0439>" | 235217019.1 |
| 25 | <U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0420><U+041A> "<U+0410><U+0442><U+0440><U+0438><U+0443><U+043C>" | 216480571.4 |
| 28 | <U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0426> "<U+041C><U+0415><U+0413><U+0410> <U+0422><U+0435><U+043F><U+043B><U+044B><U+0439> <U+0421><U+0442><U+0430><U+043D>" II | 159746337.4 |
| 42 | <U+0421><U+041F><U+0431> <U+0422><U+041A> "<U+041D><U+0435><U+0432><U+0441><U+043A><U+0438><U+0439> <U+0426><U+0435><U+043D><U+0442><U+0440>" | 151917032.5 |
| 54 | <U+0425><U+0438><U+043C><U+043A><U+0438> <U+0422><U+0426> "<U+041C><U+0435><U+0433><U+0430>" | 141939673.0 |
| 27 | <U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0426> "<U+041C><U+0415><U+0413><U+0410> <U+0411><U+0435><U+043B><U+0430><U+044F> <U+0414><U+0430><U+0447><U+0430> II" | 140632474.1 |
| 57 | <U+042F><U+043A><U+0443><U+0442><U+0441><U+043A> <U+041E><U+0440><U+0434><U+0436><U+043E><U+043D><U+0438><U+043A><U+0438><U+0434><U+0437><U+0435>, 56 | 112999783.5 |
| 12 | <U+0418><U+043D><U+0442><U+0435><U+0440><U+043D><U+0435><U+0442>-<U+043C><U+0430><U+0433><U+0430><U+0437><U+0438><U+043D> <U+0427><U+0421> | 112205072.9 |
| 6 | <U+0412><U+043E><U+0440><U+043E><U+043D><U+0435><U+0436> (<U+041F><U+043B><U+0435><U+0445><U+0430><U+043D><U+043E><U+0432><U+0441><U+043A><U+0430><U+044F>, 13) | 98067453.8 |
| 18 | <U+041A><U+0440><U+0430><U+0441><U+043D><U+043E><U+044F><U+0440><U+0441><U+043A> <U+0422><U+0426> "<U+0418><U+044E><U+043D><U+044C>" | 83392701.0 |
| 50 | <U+0422><U+044E><U+043C><U+0435><U+043D><U+044C> <U+0422><U+0426> "<U+0413><U+0443><U+0434><U+0432><U+0438><U+043D>" | 77950597.5 |
| 47 | <U+0421><U+0443><U+0440><U+0433><U+0443><U+0442> <U+0422><U+0420><U+0426> "<U+0421><U+0438><U+0442><U+0438> <U+041C><U+043E><U+043B><U+043B>" | 74882096.2 |
| 46 | <U+0421><U+0435><U+0440><U+0433><U+0438><U+0435><U+0432> <U+041F><U+043E><U+0441><U+0430><U+0434> <U+0422><U+0426> "7<U+042F>" | 74189318.9 |
| 35 | <U+041D>.<U+041D><U+043E><U+0432><U+0433><U+043E><U+0440><U+043E><U+0434> <U+0422><U+0420><U+0426> "<U+0424><U+0430><U+043D><U+0442><U+0430><U+0441><U+0442><U+0438><U+043A><U+0430>" | 70204774.9 |
| 24 | <U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+041A> "<U+0411><U+0443><U+0434><U+0435><U+043D><U+043E><U+0432><U+0441><U+043A><U+0438><U+0439>" (<U+043F><U+0430><U+0432>.<U+041A>7) | 68811693.5 |
| 58 | <U+042F><U+043A><U+0443><U+0442><U+0441><U+043A> <U+0422><U+0426> "<U+0426><U+0435><U+043D><U+0442><U+0440><U+0430><U+043B><U+044C><U+043D><U+044B><U+0439>" | 68737617.8 |
| 15 | <U+041A><U+0430><U+043B><U+0443><U+0433><U+0430> <U+0422><U+0420><U+0426> "XXI <U+0432><U+0435><U+043A>" | 68710368.4 |
| 26 | <U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0426> "<U+0410><U+0440><U+0435><U+0430><U+043B>" (<U+0411><U+0435><U+043B><U+044F><U+0435><U+0432><U+043E>) | 67551089.6 |
| 7 | <U+0412><U+043E><U+0440><U+043E><U+043D><U+0435><U+0436> <U+0422><U+0420><U+0426> "<U+041C><U+0430><U+043A><U+0441><U+0438><U+043C><U+0438><U+0440>" | 66957658.6 |
| 38 | <U+041E><U+043C><U+0441><U+043A> <U+0422><U+0426> "<U+041C><U+0435><U+0433><U+0430>" | 65766434.4 |
| 19 | <U+041A><U+0443><U+0440><U+0441><U+043A> <U+0422><U+0426> "<U+041F><U+0443><U+0448><U+043A><U+0438><U+043D><U+0441><U+043A><U+0438><U+0439>" | 64953921.6 |
| 21 | <U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+041C><U+0422><U+0420><U+0426> "<U+0410><U+0444><U+0438> <U+041C><U+043E><U+043B><U+043B>" | 62581387.8 |
| 43 | <U+0421><U+041F><U+0431> <U+0422><U+041A> "<U+0421><U+0435><U+043D><U+043D><U+0430><U+044F>" | 62217699.4 |
| 22 | <U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+041C><U+0430><U+0433><U+0430><U+0437><U+0438><U+043D> <U+0421>21 | 60438890.2 |
| 56 | <U+0427><U+0435><U+0445><U+043E><U+0432> <U+0422><U+0420><U+0426> "<U+041A><U+0430><U+0440><U+043D><U+0430><U+0432><U+0430><U+043B>" | 58605903.5 |
| 16 | <U+041A><U+043E><U+043B><U+043E><U+043C><U+043D><U+0430> <U+0422><U+0426> "<U+0420><U+0438><U+043E>" | 57436486.6 |
| 29 | <U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0426> "<U+041D><U+043E><U+0432><U+044B><U+0439> <U+0432><U+0435><U+043A>" (<U+041D><U+043E><U+0432><U+043E><U+043A><U+043E><U+0441><U+0438><U+043D><U+043E>) | 56550135.5 |
| 53 | <U+0423><U+0444><U+0430> <U+0422><U+0426> "<U+0421><U+0435><U+043C><U+044C><U+044F>" 2 | 56511041.7 |
| 55 | <U+0426><U+0438><U+0444><U+0440><U+043E><U+0432><U+043E><U+0439> <U+0441><U+043A><U+043B><U+0430><U+0434> 1<U+0421>-<U+041E><U+043D><U+043B><U+0430><U+0439><U+043D> | 49792061.5 |
| 14 | <U+041A><U+0430><U+0437><U+0430><U+043D><U+044C> <U+0422><U+0426> "<U+041F><U+0430><U+0440><U+043A><U+0425><U+0430><U+0443><U+0441>" II | 46737281.5 |
| 30 | <U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0426> "<U+041F><U+0435><U+0440><U+043B><U+043E><U+0432><U+0441><U+043A><U+0438><U+0439>" | 46702461.4 |
| 41 | <U+0420><U+043E><U+0441><U+0442><U+043E><U+0432><U+041D><U+0430><U+0414><U+043E><U+043D><U+0443> <U+0422><U+0426> "<U+041C><U+0435><U+0433><U+0430>" | 46092990.5 |
| 37 | <U+041D><U+043E><U+0432><U+043E><U+0441><U+0438><U+0431><U+0438><U+0440><U+0441><U+043A> <U+0422><U+0426> "<U+041C><U+0435><U+0433><U+0430>" | 45974685.0 |
| 59 | <U+042F><U+0440><U+043E><U+0441><U+043B><U+0430><U+0432><U+043B><U+044C> <U+0422><U+0426> "<U+0410><U+043B><U+044C><U+0442><U+0430><U+0438><U+0440>" | 45226578.9 |
| 52 | <U+0423><U+0444><U+0430> <U+0422><U+041A> "<U+0426><U+0435><U+043D><U+0442><U+0440><U+0430><U+043B><U+044C><U+043D><U+044B><U+0439>" | 45054241.7 |
| 2 | <U+0410><U+0434><U+044B><U+0433><U+0435><U+044F> <U+0422><U+0426> "<U+041C><U+0435><U+0433><U+0430>" | 44049638.8 |
| 45 | <U+0421><U+0430><U+043C><U+0430><U+0440><U+0430> <U+0422><U+0426> "<U+041F><U+0430><U+0440><U+043A><U+0425><U+0430><U+0443><U+0441>" | 40924797.5 |
| 4 | <U+0412><U+043E><U+043B><U+0436><U+0441><U+043A><U+0438><U+0439> <U+0422><U+0426> "<U+0412><U+043E><U+043B><U+0433><U+0430> <U+041C><U+043E><U+043B><U+043B>" | 40539650.3 |
| 5 | <U+0412><U+043E><U+043B><U+043E><U+0433><U+0434><U+0430> <U+0422><U+0420><U+0426> "<U+041C><U+0430><U+0440><U+043C><U+0435><U+043B><U+0430><U+0434>" | 38160429.0 |
| 44 | <U+0421><U+0430><U+043C><U+0430><U+0440><U+0430> <U+0422><U+0426> "<U+041C><U+0435><U+043B><U+043E><U+0434><U+0438><U+044F>" | 34204028.3 |
| 3 | <U+0411><U+0430><U+043B><U+0430><U+0448><U+0438><U+0445><U+0430> <U+0422><U+0420><U+041A> "<U+041E><U+043A><U+0442><U+044F><U+0431><U+0440><U+044C>-<U+041A><U+0438><U+043D><U+043E><U+043C><U+0438><U+0440>" | 30140853.9 |
| 17 | <U+041A><U+0440><U+0430><U+0441><U+043D><U+043E><U+044F><U+0440><U+0441><U+043A> <U+0422><U+0426> "<U+0412><U+0437><U+043B><U+0435><U+0442><U+043A><U+0430> <U+041F><U+043B><U+0430><U+0437><U+0430>" | 27784948.3 |
| 48 | <U+0422><U+043E><U+043C><U+0441><U+043A> <U+0422><U+0420><U+0426> "<U+0418><U+0437><U+0443><U+043C><U+0440><U+0443><U+0434><U+043D><U+044B><U+0439> <U+0413><U+043E><U+0440><U+043E><U+0434>" | 27714110.0 |
| 51 | <U+0422><U+044E><U+043C><U+0435><U+043D><U+044C> <U+0422><U+0426> "<U+0417><U+0435><U+043B><U+0435><U+043D><U+044B><U+0439> <U+0411><U+0435><U+0440><U+0435><U+0433>" | 24205161.2 |
| 49 | <U+0422><U+044E><U+043C><U+0435><U+043D><U+044C> <U+0422><U+0420><U+0426> "<U+041A><U+0440><U+0438><U+0441><U+0442><U+0430><U+043B><U+043B>" | 20140923.5 |
| 10 | <U+0416><U+0443><U+043A><U+043E><U+0432><U+0441><U+043A><U+0438><U+0439> <U+0443><U+043B>. <U+0427><U+043A><U+0430><U+043B><U+043E><U+0432><U+0430> 39<U+043C>? | 19410283.4 |
| 39 | <U+0420><U+043E><U+0441><U+0442><U+043E><U+0432><U+041D><U+0430><U+0414><U+043E><U+043D><U+0443> <U+0422><U+0420><U+041A> "<U+041C><U+0435><U+0433><U+0430><U+0446><U+0435><U+043D><U+0442><U+0440> <U+0413><U+043E><U+0440><U+0438><U+0437><U+043E><U+043D><U+0442>" | 18092891.0 |
| 9 | <U+0412><U+044B><U+0435><U+0437><U+0434><U+043D><U+0430><U+044F> <U+0422><U+043E><U+0440><U+0433><U+043E><U+0432><U+043B><U+044F> | 16372837.0 |
| 34 | <U+041D>.<U+041D><U+043E><U+0432><U+0433><U+043E><U+0440><U+043E><U+0434> <U+0422><U+0420><U+0426> "<U+0420><U+0418><U+041E>" | 8582822.0 |
| 0 | !<U+042F><U+043A><U+0443><U+0442><U+0441><U+043A> <U+041E><U+0440><U+0434><U+0436><U+043E><U+043D><U+0438><U+043A><U+0438><U+0434><U+0437><U+0435>, 56 <U+0444><U+0440><U+0430><U+043D> | 6637370.0 |
| 20 | <U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> "<U+0420><U+0430><U+0441><U+043F><U+0440><U+043E><U+0434><U+0430><U+0436><U+0430>" | 6599924.0 |
| 13 | <U+041A><U+0430><U+0437><U+0430><U+043D><U+044C> <U+0422><U+0426> "<U+0411><U+0435><U+0445><U+0435><U+0442><U+043B><U+0435>" | 6006173.2 |
| 33 | <U+041C><U+044B><U+0442><U+0438><U+0449><U+0438> <U+0422><U+0420><U+041A> "XL-3" | 5949101.6 |
| 32 | <U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+0426> "<U+0421><U+0435><U+0440><U+0435><U+0431><U+0440><U+044F><U+043D><U+044B><U+0439> <U+0414><U+043E><U+043C>" | 5772824.0 |
| 23 | <U+041C><U+043E><U+0441><U+043A><U+0432><U+0430> <U+0422><U+041A> "<U+0411><U+0443><U+0434><U+0435><U+043D><U+043E><U+0432><U+0441><U+043A><U+0438><U+0439>" (<U+043F><U+0430><U+0432>.<U+0410>2) | 5621521.2 |
| 40 | <U+0420><U+043E><U+0441><U+0442><U+043E><U+0432><U+041D><U+0430><U+0414><U+043E><U+043D><U+0443> <U+0422><U+0420><U+041A> "<U+041C><U+0435><U+0433><U+0430><U+0446><U+0435><U+043D><U+0442><U+0440> <U+0413><U+043E><U+0440><U+0438><U+0437><U+043E><U+043D><U+0442>" <U+041E><U+0441><U+0442><U+0440><U+043E><U+0432><U+043D><U+043E><U+0439> | 4293586.8 |
| 1 | !<U+042F><U+043A><U+0443><U+0442><U+0441><U+043A> <U+0422><U+0426> "<U+0426><U+0435><U+043D><U+0442><U+0440><U+0430><U+043B><U+044C><U+043D><U+044B><U+0439>" <U+0444><U+0440><U+0430><U+043D> | 3238207.0 |
| 8 | <U+0412><U+043E><U+0440><U+043E><U+043D><U+0435><U+0436> <U+0422><U+0420><U+0426> <U+0421><U+0438><U+0442><U+0438>-<U+041F><U+0430><U+0440><U+043A> "<U+0413><U+0440><U+0430><U+0434>" | 2349358.0 |
| 11 | <U+0416><U+0443><U+043A><U+043E><U+0432><U+0441><U+043A><U+0438><U+0439> <U+0443><U+043B>. <U+0427><U+043A><U+0430><U+043B><U+043E><U+0432><U+0430> 39<U+043C>² | 521655.1 |
| 36 | <U+041D><U+043E><U+0432><U+043E><U+0441><U+0438><U+0431><U+0438><U+0440><U+0441><U+043A> <U+0422><U+0420><U+0426> "<U+0413><U+0430><U+043B><U+0435><U+0440><U+0435><U+044F> <U+041D><U+043E><U+0432><U+043E><U+0441><U+0438><U+0431><U+0438><U+0440><U+0441><U+043A>" | 377714.0 |
# Plot for shop vs sales volume
plot_ly(sales_volume, y=~reorder(as.factor(shop_id) , sales_vol), x=~sales_vol, color=~as.factor(shop_id), type='bar', hoverinfo='text', text=~paste('Shop',shop_id,':',round(sales_vol)), orientation='h') %>%
layout(title="Sales Volume", yaxis=list(title="Shop"), xaxis=list(title="Sales Volume(Amount)"), legend=list(title=list(text='<b>Shop ID</b>')))
# We will now convert our dataframe into a data table which makes it easier for further process
library(data.table)
train_datatable = as.data.table(merged_sales)
monthly_shop_data = train_datatable[, list(item_cnt_month=(sum(item_cnt_day))/12), by = c("date_block_num", "month","shop_id", "item_category_id", "item_id", "item_price")]
summary(monthly_shop_data)
head(monthly_shop_data)
nrow(monthly_shop_data)
Attaching package: 'data.table'
The following objects are masked from 'package:lubridate':
hour, isoweek, mday, minute, month, quarter, second, wday, week,
yday, year
The following objects are masked from 'package:dplyr':
between, first, last
The following object is masked from 'package:purrr':
transpose
date_block_num month shop_id item_category_id
Min. : 0.00 1 :175576 31 : 109408 40 :343281
1st Qu.: 6.00 3 :167065 25 : 95955 55 :249575
Median :14.00 2 :163914 54 : 75612 30 :152805
Mean :14.75 8 :152691 28 : 70733 37 :143685
3rd Qu.:23.00 6 :145226 57 : 64235 19 :113445
Max. :33.00 7 :144487 42 : 62337 23 : 86916
(Other):790063 (Other):1260742 (Other):649315
item_id item_price item_cnt_month
17717 : 7475 Min. : -1.0 Min. : -1.83333
7893 : 2009 1st Qu.: 199.0 1st Qu.: 0.08333
2445 : 1993 Median : 399.0 Median : 0.08333
4244 : 1941 Mean : 845.8 Mean : 0.17482
6675 : 1856 3rd Qu.: 899.0 3rd Qu.: 0.16667
4248 : 1833 Max. :307980.0 Max. :180.75000
(Other):1721915
| date_block_num | month | shop_id | item_category_id | item_id | item_price | item_cnt_month |
|---|---|---|---|---|---|---|
| <dbl> | <fct> | <fct> | <fct> | <fct> | <dbl> | <dbl> |
| 0 | 1 | 59 | 37 | 22154 | 999.00 | 0.08333333 |
| 0 | 1 | 25 | 58 | 2552 | 899.00 | 0.00000000 |
| 0 | 1 | 25 | 58 | 2554 | 1709.05 | 0.08333333 |
| 0 | 1 | 25 | 56 | 2555 | 1099.00 | 0.08333333 |
| 0 | 1 | 25 | 59 | 2564 | 349.00 | 0.08333333 |
| 0 | 1 | 25 | 56 | 2565 | 549.00 | 0.08333333 |
memory.size()
memory.limit()
memory.limit(size=56000)
library(caret)
set.seed(100)
# Divinding the data in 70/30 ratio into training and testing data
summarized_data = createDataPartition(y = monthly_shop_data$item_cnt_month, p = 0.7, list = F)
sum_training = monthly_shop_data[summarized_data, ]
testing = monthly_shop_data[-summarized_data, ]
Loading required package: lattice
Attaching package: 'caret'
The following object is masked from 'package:purrr':
lift
This validation data helps us determine weather the trained model is good or not.
set.seed(100)
# Divinding the training data further in 80/20 ratio into training and validation data
training_data = createDataPartition(y = sum_training$item_cnt_month, p = 0.8, list = F)
training = sum_training[training_data,]
validation = sum_training[-training_data,]
nrow(training)
nrow(validation)
nrow(testing)
model = train(item_cnt_month~date_block_num+month+shop_id+item_category_id, data=training, method="glm", trControl=trainControl(method="cv",number=5))
We are using the glm method for training and for trainControl we are using 5-fold cross validation.
summary(model)
Call:
NULL
Deviance Residuals:
Min 1Q Median 3Q Max
-12.588 -0.087 -0.034 0.016 179.965
Coefficients: (2 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.240e-01 3.865e-01 0.321 0.748275
date_block_num -1.157e-03 6.477e-05 -17.870 < 2e-16 ***
month2 -4.791e-03 2.511e-03 -1.908 0.056424 .
month3 2.880e-03 2.507e-03 1.149 0.250607
month4 -4.369e-03 2.627e-03 -1.663 0.096294 .
month5 -1.238e-02 2.633e-03 -4.702 2.57e-06 ***
month6 -7.247e-03 2.613e-03 -2.773 0.005553 **
month7 -1.084e-02 2.622e-03 -4.132 3.59e-05 ***
month8 -1.066e-02 2.593e-03 -4.111 3.93e-05 ***
month9 1.467e-02 2.698e-03 5.435 5.47e-08 ***
month10 1.470e-02 2.729e-03 5.384 7.27e-08 ***
month11 3.087e-02 2.897e-03 10.659 < 2e-16 ***
month12 4.367e-02 2.659e-03 16.426 < 2e-16 ***
shop_id1 -4.429e-02 1.664e-02 -2.662 0.007779 **
shop_id2 -9.009e-02 1.205e-02 -7.477 7.59e-14 ***
shop_id3 -9.635e-02 1.197e-02 -8.049 8.37e-16 ***
shop_id4 -7.553e-02 1.157e-02 -6.527 6.71e-11 ***
shop_id5 -7.858e-02 1.159e-02 -6.777 1.23e-11 ***
shop_id6 -3.759e-02 1.112e-02 -3.380 0.000725 ***
shop_id7 -5.579e-02 1.132e-02 -4.928 8.29e-07 ***
shop_id8 -8.616e-02 1.783e-02 -4.833 1.34e-06 ***
shop_id9 4.151e-01 2.014e-02 20.610 < 2e-16 ***
shop_id10 -1.076e-01 1.214e-02 -8.864 < 2e-16 ***
shop_id11 -9.053e-02 3.653e-02 -2.478 0.013212 *
shop_id12 7.741e-03 1.176e-02 0.659 0.510194
shop_id13 -7.330e-02 1.249e-02 -5.871 4.33e-09 ***
shop_id14 -6.273e-02 1.161e-02 -5.401 6.63e-08 ***
shop_id15 -4.282e-02 1.134e-02 -3.777 0.000159 ***
shop_id16 -6.473e-02 1.135e-02 -5.701 1.19e-08 ***
shop_id17 -7.867e-02 1.211e-02 -6.496 8.23e-11 ***
shop_id18 -4.786e-02 1.136e-02 -4.214 2.51e-05 ***
shop_id19 -5.227e-02 1.126e-02 -4.644 3.42e-06 ***
shop_id20 3.378e-01 2.693e-02 12.541 < 2e-16 ***
shop_id21 -5.995e-02 1.126e-02 -5.323 1.02e-07 ***
shop_id22 -6.554e-02 1.153e-02 -5.685 1.31e-08 ***
shop_id23 -6.546e-02 1.512e-02 -4.328 1.51e-05 ***
shop_id24 -4.813e-02 1.140e-02 -4.220 2.44e-05 ***
shop_id25 2.060e-02 1.090e-02 1.890 0.058731 .
shop_id26 -5.195e-02 1.132e-02 -4.587 4.49e-06 ***
shop_id27 1.083e-02 1.109e-02 0.977 0.328772
shop_id28 2.207e-02 1.099e-02 2.009 0.044569 *
shop_id29 -4.919e-02 1.145e-02 -4.297 1.73e-05 ***
shop_id30 -5.416e-02 1.135e-02 -4.771 1.83e-06 ***
shop_id31 4.827e-02 1.087e-02 4.442 8.91e-06 ***
shop_id32 -8.848e-02 1.432e-02 -6.177 6.53e-10 ***
shop_id33 -1.021e-01 1.615e-02 -6.320 2.61e-10 ***
shop_id34 -1.275e-01 1.537e-02 -8.295 < 2e-16 ***
shop_id35 -8.335e-02 1.126e-02 -7.402 1.34e-13 ***
shop_id36 -1.676e-01 4.567e-02 -3.669 0.000243 ***
shop_id37 -6.952e-02 1.151e-02 -6.038 1.56e-09 ***
shop_id38 -6.764e-02 1.147e-02 -5.895 3.74e-09 ***
shop_id39 -8.867e-02 1.288e-02 -6.884 5.82e-12 ***
shop_id40 -8.852e-02 1.860e-02 -4.760 1.94e-06 ***
shop_id41 -7.089e-02 1.152e-02 -6.154 7.58e-10 ***
shop_id42 -3.895e-03 1.104e-02 -0.353 0.724140
shop_id43 -4.475e-02 1.163e-02 -3.847 0.000119 ***
shop_id44 -7.637e-02 1.150e-02 -6.642 3.10e-11 ***
shop_id45 -7.786e-02 1.162e-02 -6.703 2.05e-11 ***
shop_id46 -4.219e-02 1.125e-02 -3.750 0.000177 ***
shop_id47 -5.077e-02 1.135e-02 -4.474 7.68e-06 ***
shop_id48 -6.718e-02 1.229e-02 -5.465 4.63e-08 ***
shop_id49 -1.021e-01 1.263e-02 -8.089 6.03e-16 ***
shop_id50 -4.213e-02 1.126e-02 -3.742 0.000182 ***
shop_id51 -6.771e-02 1.142e-02 -5.931 3.02e-09 ***
shop_id52 -7.226e-02 1.146e-02 -6.305 2.89e-10 ***
shop_id53 -5.786e-02 1.140e-02 -5.077 3.83e-07 ***
shop_id54 1.369e-02 1.095e-02 1.250 0.211249
shop_id55 1.129e+01 1.143e-01 98.761 < 2e-16 ***
shop_id56 -5.664e-02 1.117e-02 -5.071 3.95e-07 ***
shop_id57 -1.055e-02 1.103e-02 -0.956 0.338989
shop_id58 -4.030e-02 1.125e-02 -3.583 0.000340 ***
shop_id59 -7.366e-02 1.153e-02 -6.388 1.68e-10 ***
item_category_id1 NA NA NA NA
item_category_id2 5.886e-02 3.866e-01 0.152 0.878979
item_category_id3 1.644e-01 3.866e-01 0.425 0.670735
item_category_id4 6.264e-02 3.870e-01 0.162 0.871422
item_category_id5 3.656e-02 3.867e-01 0.095 0.924672
item_category_id6 9.355e-02 3.866e-01 0.242 0.808806
item_category_id7 7.516e-02 3.868e-01 0.194 0.845910
item_category_id8 1.558e+00 3.882e-01 4.014 5.96e-05 ***
item_category_id9 6.766e-01 3.869e-01 1.749 0.080312 .
item_category_id10 NA NA NA NA
item_category_id11 3.860e-02 3.867e-01 0.100 0.920491
item_category_id12 2.768e-01 3.868e-01 0.716 0.474127
item_category_id13 1.831e-02 3.889e-01 0.047 0.962446
item_category_id14 2.769e-02 3.868e-01 0.072 0.942943
item_category_id15 5.366e-02 3.867e-01 0.139 0.889648
item_category_id16 5.917e-02 3.869e-01 0.153 0.878468
item_category_id17 5.909e-03 3.888e-01 0.015 0.987873
item_category_id18 7.367e-02 4.574e-01 0.161 0.872027
item_category_id19 1.187e-01 3.865e-01 0.307 0.758776
item_category_id20 2.589e-01 3.866e-01 0.670 0.502971
item_category_id21 5.238e-02 3.866e-01 0.136 0.892209
item_category_id22 6.543e-02 3.866e-01 0.169 0.865606
item_category_id23 9.445e-02 3.865e-01 0.244 0.806970
item_category_id24 7.239e-02 3.866e-01 0.187 0.851462
item_category_id25 4.338e-02 3.866e-01 0.112 0.910653
item_category_id26 -1.094e+01 4.286e-01 -25.518 < 2e-16 ***
item_category_id27 -1.130e+01 5.118e-01 -22.079 < 2e-16 ***
item_category_id28 1.921e-01 3.866e-01 0.497 0.619237
item_category_id29 2.149e-01 3.866e-01 0.556 0.578271
item_category_id30 1.809e-01 3.865e-01 0.468 0.639745
item_category_id31 -1.109e+01 4.030e-01 -27.511 < 2e-16 ***
item_category_id32 2.469e-01 3.869e-01 0.638 0.523282
item_category_id33 1.044e-01 3.867e-01 0.270 0.787188
item_category_id34 -6.346e+00 4.100e-01 -15.479 < 2e-16 ***
item_category_id35 3.799e-01 3.867e-01 0.983 0.325835
item_category_id36 -1.118e+01 4.713e-01 -23.727 < 2e-16 ***
item_category_id37 3.952e-02 3.865e-01 0.102 0.918569
item_category_id38 3.783e-02 3.866e-01 0.098 0.922047
item_category_id39 2.575e-02 3.879e-01 0.066 0.947075
item_category_id40 7.569e-02 3.865e-01 0.196 0.844749
item_category_id41 2.893e-02 3.866e-01 0.075 0.940336
item_category_id42 2.162e-01 3.871e-01 0.559 0.576453
item_category_id43 1.395e-03 3.866e-01 0.004 0.997121
item_category_id44 -1.130e+01 4.059e-01 -27.836 < 2e-16 ***
item_category_id45 -2.143e-03 3.867e-01 -0.006 0.995579
item_category_id46 3.667e-03 4.322e-01 0.008 0.993230
item_category_id47 1.122e-01 3.868e-01 0.290 0.771780
item_category_id48 -6.793e-03 4.990e-01 -0.014 0.989138
item_category_id49 9.056e-02 3.866e-01 0.234 0.814795
item_category_id50 1.358e-01 5.467e-01 0.248 0.803823
item_category_id51 5.859e-02 6.695e-01 0.088 0.930255
item_category_id52 5.183e-02 5.466e-01 0.095 0.924459
item_category_id53 -3.406e-02 6.695e-01 -0.051 0.959418
item_category_id54 -1.074e+01 4.037e-01 -26.603 < 2e-16 ***
item_category_id55 4.188e-02 3.865e-01 0.108 0.913716
item_category_id56 1.612e-02 3.866e-01 0.042 0.966738
item_category_id57 1.569e-02 3.866e-01 0.041 0.967621
item_category_id58 -3.676e-04 3.866e-01 -0.001 0.999241
item_category_id59 -1.082e-02 3.867e-01 -0.028 0.977678
item_category_id60 -3.018e-03 3.870e-01 -0.008 0.993778
item_category_id61 3.000e-02 3.866e-01 0.078 0.938149
item_category_id62 5.092e-02 3.866e-01 0.132 0.895212
item_category_id63 7.000e-02 3.866e-01 0.181 0.856299
item_category_id64 5.896e-02 3.866e-01 0.153 0.878779
item_category_id65 1.637e-01 3.866e-01 0.423 0.672006
item_category_id66 1.042e-01 3.868e-01 0.269 0.787730
item_category_id67 3.908e-02 3.866e-01 0.101 0.919472
item_category_id68 -6.079e-02 3.930e-01 -0.155 0.877084
item_category_id69 8.686e-02 3.866e-01 0.225 0.822222
item_category_id70 1.290e-01 3.866e-01 0.334 0.738614
item_category_id71 9.682e+00 3.870e-01 25.018 < 2e-16 ***
item_category_id72 5.562e-02 3.866e-01 0.144 0.885584
item_category_id73 1.155e-01 3.867e-01 0.299 0.765192
item_category_id74 -1.115e+01 4.305e-01 -25.889 < 2e-16 ***
item_category_id75 1.167e-01 3.866e-01 0.302 0.762799
item_category_id76 -1.110e+01 4.034e-01 -27.518 < 2e-16 ***
item_category_id77 6.641e-02 3.868e-01 0.172 0.863696
item_category_id78 -1.127e+01 4.034e-01 -27.935 < 2e-16 ***
item_category_id79 1.145e-01 3.866e-01 0.296 0.767151
item_category_id80 1.207e+00 3.883e-01 3.108 0.001884 **
item_category_id81 1.662e-01 3.884e-01 0.428 0.668697
item_category_id82 1.015e+00 3.873e-01 2.620 0.008790 **
item_category_id83 3.187e-01 3.868e-01 0.824 0.410037
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for gaussian family taken to be 0.2987583)
Null deviance: 382574 on 973854 degrees of freedom
Residual deviance: 290902 on 973702 degrees of freedom
AIC: 1587302
Number of Fisher Scoring iterations: 2
# Predicting the data from validation set
p = predict(model, validation)
error = (p - validation$item_cnt_month)
# Comparing the RMSE error from the model and the validation set
RMSE_model = model$results$RMSE;
cat("RMSE error of the model:",RMSE_model)
RMSE_new = sqrt(mean(error^2));
cat("\nRMSE error of the predicted values:",RMSE_new)
RMSE error of the model: 0.5454389 RMSE error of the predicted values: 0.5224954
# Predicting the test set
prediction = predict(model, testing)
# Determine the error in the test set
test_error = (prediction - testing$item_cnt_month)
RMSE_test = sqrt(mean(test_error^2))
cat("RMSE error for the test data:",RMSE_test)
RMSE error for the test data: 0.5833928
Now we imprt the test data from the CSV file from which we have the predict the data.
# import test data to be predicted
test_data = read_csv("test.csv")
# Assign 11 to month for November
test_data$month = 11
test_data$month = as.factor(test_data$month)
test_data$shop_id = as.factor(test_data$shop_id)
# Assign 34 to date_block_num for Nov 2015
test_data$date_block_num = 34
head(test_data)
Parsed with column specification: cols( ID = col_double(), shop_id = col_double(), item_id = col_double() )
| ID | shop_id | item_id | month | date_block_num |
|---|---|---|---|---|
| <dbl> | <fct> | <dbl> | <fct> | <dbl> |
| 0 | 5 | 5037 | 11 | 34 |
| 1 | 5 | 5320 | 11 | 34 |
| 2 | 5 | 5233 | 11 | 34 |
| 3 | 5 | 5232 | 11 | 34 |
| 4 | 5 | 5268 | 11 | 34 |
| 5 | 5 | 5039 | 11 | 34 |
#combine test_data and items_data by item_id to get item_category_id
combined_test = left_join(x=test_data, y = items, by = c("item_id"))
combined_test$item_category_id = as.factor(combined_test$item_category_id)
head(combined_test)
| ID | shop_id | item_id | month | date_block_num | item_name | item_category_id |
|---|---|---|---|---|---|---|
| <dbl> | <fct> | <dbl> | <fct> | <dbl> | <chr> | <fct> |
| 0 | 5 | 5037 | 11 | 34 | NHL 15 [PS3, <U+0440><U+0443><U+0441><U+0441><U+043A><U+0438><U+0435> <U+0441><U+0443><U+0431><U+0442><U+0438><U+0442><U+0440><U+044B>] | 19 |
| 1 | 5 | 5320 | 11 | 34 | ONE DIRECTION Made In The A.M. | 55 |
| 2 | 5 | 5233 | 11 | 34 | Need for Speed Rivals (Essentials) [PS3, <U+0440><U+0443><U+0441><U+0441><U+043A><U+0430><U+044F> <U+0432><U+0435><U+0440><U+0441><U+0438><U+044F>] | 19 |
| 3 | 5 | 5232 | 11 | 34 | Need for Speed Rivals (Classics) [Xbox 360, <U+0440><U+0443><U+0441><U+0441><U+043A><U+0430><U+044F> <U+0432><U+0435><U+0440><U+0441><U+0438><U+044F>] | 23 |
| 4 | 5 | 5268 | 11 | 34 | Need for Speed [PS4, <U+0440><U+0443><U+0441><U+0441><U+043A><U+0430><U+044F> <U+0432><U+0435><U+0440><U+0441><U+0438><U+044F>] | 20 |
| 5 | 5 | 5039 | 11 | 34 | NHL 15 [Xbox 360, <U+0440><U+0443><U+0441><U+0441><U+043A><U+0438><U+0435> <U+0441><U+0443><U+0431><U+0442><U+0438><U+0442><U+0440><U+044B>] | 23 |
# Fit the model for our predictions
test_prediction = predict(model, combined_test)
# Prepart and write the predicted data as per the required format into a CSV file.
submit = cbind(combined_test[,1], test_prediction)
colnames(submit) = c("ID", "item_cnt_month")
class(submit)
submit$ID = as.integer(submit$ID)
head(submit)
write.csv(submit, "submission.csv", row.names = FALSE, quote = FALSE)
| ID | item_cnt_month | |
|---|---|---|
| <int> | <dbl> | |
| 1 | 0 | 0.15569690 |
| 2 | 1 | 0.07887593 |
| 3 | 2 | 0.15569690 |
| 4 | 3 | 0.13144073 |
| 5 | 4 | 0.29592050 |
| 6 | 5 | 0.13144073 |